Remove padal from quantized microkernel names.

- PADAL is the only variant for these microkernels.
- Remove from file names and function names.

PiperOrigin-RevId: 409461961
diff --git a/test/qc8-gemm-minmax-fp32.cc b/test/qc8-gemm-minmax-fp32.cc
index 1071145..d16abb9 100644
--- a/test/qc8-gemm-minmax-fp32.cc
+++ b/test/qc8-gemm-minmax-fp32.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -61,10 +61,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -77,12 +77,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -94,11 +94,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -110,11 +110,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -125,11 +125,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -141,11 +141,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -159,13 +159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -176,11 +176,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -192,11 +192,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -210,13 +210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -227,11 +227,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -243,11 +243,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -261,13 +261,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -279,12 +279,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -297,12 +297,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -315,12 +315,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -334,13 +334,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -352,12 +352,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -370,12 +370,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -388,12 +388,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -407,13 +407,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -428,13 +428,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -445,10 +445,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -459,10 +459,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -473,13 +473,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -489,10 +489,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -503,10 +503,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -517,10 +517,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -533,12 +533,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -550,11 +550,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -566,11 +566,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -581,11 +581,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -597,11 +597,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -615,13 +615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -632,11 +632,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -648,11 +648,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -666,13 +666,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -683,11 +683,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -699,11 +699,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -717,13 +717,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -735,12 +735,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -753,12 +753,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -771,12 +771,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -790,13 +790,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -808,12 +808,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -826,12 +826,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -844,12 +844,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -863,13 +863,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -884,13 +884,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -901,10 +901,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -915,10 +915,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -929,13 +929,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -945,10 +945,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -959,10 +959,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -973,10 +973,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -989,12 +989,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1006,11 +1006,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1022,11 +1022,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1037,11 +1037,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1053,11 +1053,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1071,13 +1071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1088,11 +1088,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1104,11 +1104,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1122,13 +1122,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1139,11 +1139,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1155,11 +1155,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1173,13 +1173,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1191,12 +1191,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1209,12 +1209,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1227,12 +1227,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1246,13 +1246,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1264,12 +1264,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1282,12 +1282,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1300,12 +1300,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1319,13 +1319,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1340,13 +1340,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1357,10 +1357,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1371,10 +1371,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1385,13 +1385,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1401,10 +1401,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1415,10 +1415,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1429,10 +1429,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1445,12 +1445,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1462,11 +1462,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1478,11 +1478,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1493,11 +1493,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1509,11 +1509,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1527,13 +1527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1544,11 +1544,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1560,11 +1560,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1578,13 +1578,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1595,11 +1595,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1611,11 +1611,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1629,13 +1629,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1647,12 +1647,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1665,12 +1665,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1683,12 +1683,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1702,13 +1702,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1720,12 +1720,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1738,12 +1738,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1756,12 +1756,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1775,13 +1775,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1796,13 +1796,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1813,10 +1813,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1827,10 +1827,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1841,13 +1841,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1857,10 +1857,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1871,10 +1871,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1885,10 +1885,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1901,12 +1901,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1918,11 +1918,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1934,11 +1934,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1949,11 +1949,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1965,11 +1965,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1983,13 +1983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2000,11 +2000,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2016,11 +2016,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2034,13 +2034,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2051,11 +2051,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2067,11 +2067,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2085,13 +2085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2103,12 +2103,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2121,12 +2121,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2139,12 +2139,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2158,13 +2158,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2176,12 +2176,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2194,12 +2194,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2212,12 +2212,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2231,13 +2231,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2252,13 +2252,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2269,10 +2269,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2283,10 +2283,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2297,13 +2297,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2313,10 +2313,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2327,10 +2327,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2341,10 +2341,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2357,12 +2357,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2374,11 +2374,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2390,11 +2390,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2405,11 +2405,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2421,11 +2421,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2439,13 +2439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2456,11 +2456,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2472,11 +2472,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2490,13 +2490,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2507,11 +2507,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2523,11 +2523,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2541,13 +2541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2559,12 +2559,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2577,12 +2577,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2595,12 +2595,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2614,13 +2614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2632,12 +2632,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2650,12 +2650,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2668,12 +2668,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2687,13 +2687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2708,13 +2708,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2725,10 +2725,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2739,10 +2739,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2753,13 +2753,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2769,10 +2769,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2783,10 +2783,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2797,10 +2797,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2813,12 +2813,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -2830,11 +2830,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2846,11 +2846,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2861,11 +2861,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2877,11 +2877,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2895,13 +2895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2912,11 +2912,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2928,11 +2928,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2946,13 +2946,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2963,11 +2963,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2979,11 +2979,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2997,13 +2997,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3015,12 +3015,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3033,12 +3033,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3051,12 +3051,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3070,13 +3070,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3088,12 +3088,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3106,12 +3106,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3124,12 +3124,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3143,13 +3143,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3164,13 +3164,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3181,10 +3181,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3195,10 +3195,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3209,13 +3209,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3225,10 +3225,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3239,10 +3239,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3253,10 +3253,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3269,12 +3269,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -3286,11 +3286,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3302,11 +3302,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3317,11 +3317,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3333,11 +3333,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3351,13 +3351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3368,11 +3368,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3384,11 +3384,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3402,13 +3402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3419,11 +3419,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3435,11 +3435,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3453,13 +3453,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3471,12 +3471,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3489,12 +3489,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3507,12 +3507,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3526,13 +3526,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3544,12 +3544,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3562,12 +3562,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3580,12 +3580,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3599,13 +3599,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3620,13 +3620,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3637,10 +3637,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3651,10 +3651,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3665,13 +3665,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3681,10 +3681,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3695,10 +3695,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3709,10 +3709,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3725,12 +3725,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3742,11 +3742,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3758,11 +3758,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3773,11 +3773,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3789,11 +3789,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3807,13 +3807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3858,13 +3858,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3875,11 +3875,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3891,11 +3891,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3909,13 +3909,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3927,12 +3927,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3945,12 +3945,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3963,12 +3963,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3982,13 +3982,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4000,12 +4000,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4018,12 +4018,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4036,12 +4036,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4055,13 +4055,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4076,13 +4076,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4093,10 +4093,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4107,10 +4107,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4121,13 +4121,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4137,10 +4137,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4151,10 +4151,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4165,10 +4165,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4181,12 +4181,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4198,11 +4198,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4214,11 +4214,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4229,11 +4229,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4245,11 +4245,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4263,13 +4263,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4280,11 +4280,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4296,11 +4296,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4314,13 +4314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4331,11 +4331,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4347,11 +4347,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4365,13 +4365,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4383,12 +4383,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4401,12 +4401,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4419,12 +4419,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4438,13 +4438,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4456,12 +4456,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4474,12 +4474,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4492,12 +4492,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4511,13 +4511,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4532,13 +4532,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4549,10 +4549,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4563,10 +4563,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4577,13 +4577,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4593,10 +4593,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4607,10 +4607,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4621,10 +4621,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4637,12 +4637,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4654,11 +4654,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4670,11 +4670,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4685,11 +4685,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4701,11 +4701,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4719,13 +4719,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4736,11 +4736,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4752,11 +4752,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4770,13 +4770,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4787,11 +4787,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4803,11 +4803,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4821,13 +4821,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4839,12 +4839,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4857,12 +4857,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4875,12 +4875,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4894,13 +4894,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4912,12 +4912,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4930,12 +4930,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4948,12 +4948,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4967,13 +4967,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4988,13 +4988,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5005,10 +5005,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5019,10 +5019,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5033,13 +5033,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5049,10 +5049,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5063,10 +5063,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5077,10 +5077,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5093,12 +5093,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -5110,11 +5110,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5126,11 +5126,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5141,11 +5141,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5157,11 +5157,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5175,13 +5175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5192,11 +5192,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5208,11 +5208,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5226,13 +5226,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5243,11 +5243,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5295,12 +5295,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5313,12 +5313,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5331,12 +5331,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5350,13 +5350,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5368,12 +5368,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5386,12 +5386,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5404,12 +5404,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5423,13 +5423,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5444,13 +5444,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5461,10 +5461,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5475,10 +5475,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5489,13 +5489,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5505,10 +5505,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5519,10 +5519,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5533,10 +5533,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5549,12 +5549,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5566,11 +5566,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5582,11 +5582,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5597,11 +5597,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5613,11 +5613,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5631,13 +5631,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5648,11 +5648,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5664,11 +5664,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5682,13 +5682,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5699,11 +5699,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5715,11 +5715,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5733,13 +5733,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5751,12 +5751,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5769,12 +5769,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5787,12 +5787,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5806,13 +5806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5824,12 +5824,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5842,12 +5842,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5860,12 +5860,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5879,13 +5879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5900,13 +5900,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5917,10 +5917,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5931,10 +5931,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5945,13 +5945,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5961,10 +5961,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5975,10 +5975,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5989,10 +5989,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6005,12 +6005,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6022,11 +6022,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6038,11 +6038,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6053,11 +6053,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6069,11 +6069,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6087,13 +6087,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6104,11 +6104,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6120,11 +6120,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6138,13 +6138,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6155,11 +6155,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6171,11 +6171,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6189,13 +6189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6207,12 +6207,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6225,12 +6225,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6243,12 +6243,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6262,13 +6262,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6280,12 +6280,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6298,12 +6298,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6316,12 +6316,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6335,13 +6335,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6356,13 +6356,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6373,10 +6373,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6387,10 +6387,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6401,13 +6401,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6417,10 +6417,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6431,10 +6431,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6445,10 +6445,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6461,12 +6461,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -6478,11 +6478,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6494,11 +6494,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6509,11 +6509,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6525,11 +6525,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6543,13 +6543,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6560,11 +6560,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6576,11 +6576,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6594,13 +6594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6611,11 +6611,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6627,11 +6627,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6645,13 +6645,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6663,12 +6663,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6681,12 +6681,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6699,12 +6699,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6718,13 +6718,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6736,12 +6736,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6754,12 +6754,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6772,12 +6772,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6791,13 +6791,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6812,13 +6812,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6829,10 +6829,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6843,10 +6843,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6857,13 +6857,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -6873,10 +6873,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -6887,10 +6887,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -6901,10 +6901,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6917,12 +6917,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6934,11 +6934,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6950,11 +6950,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6965,11 +6965,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6981,11 +6981,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6999,13 +6999,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7016,11 +7016,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7032,11 +7032,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7050,13 +7050,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7067,11 +7067,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7083,11 +7083,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7101,13 +7101,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7119,12 +7119,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7137,12 +7137,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7155,12 +7155,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7174,13 +7174,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7192,12 +7192,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7210,12 +7210,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7228,12 +7228,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7247,13 +7247,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7268,13 +7268,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7285,10 +7285,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7299,10 +7299,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7313,13 +7313,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7329,10 +7329,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7343,10 +7343,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7357,10 +7357,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7373,12 +7373,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7390,11 +7390,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7406,11 +7406,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7421,11 +7421,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7437,11 +7437,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7455,13 +7455,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7472,11 +7472,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7488,11 +7488,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7506,13 +7506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7523,11 +7523,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7539,11 +7539,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7557,13 +7557,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7575,12 +7575,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7593,12 +7593,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7611,12 +7611,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7630,13 +7630,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7648,12 +7648,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7666,12 +7666,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7684,12 +7684,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7703,13 +7703,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7724,13 +7724,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7741,10 +7741,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7755,10 +7755,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7769,13 +7769,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7785,10 +7785,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7799,10 +7799,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7813,10 +7813,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7829,12 +7829,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7846,11 +7846,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7862,11 +7862,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7877,11 +7877,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7893,11 +7893,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7911,13 +7911,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7928,11 +7928,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7944,11 +7944,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7962,13 +7962,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7979,11 +7979,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7995,11 +7995,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8013,13 +8013,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8031,12 +8031,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8049,12 +8049,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8067,12 +8067,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8086,13 +8086,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8104,12 +8104,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8122,12 +8122,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8140,12 +8140,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8159,13 +8159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8180,13 +8180,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8197,10 +8197,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8211,10 +8211,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8225,13 +8225,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8241,10 +8241,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8255,10 +8255,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8269,10 +8269,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8285,12 +8285,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8302,11 +8302,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8318,11 +8318,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8333,11 +8333,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8349,11 +8349,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8367,13 +8367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8384,11 +8384,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8400,11 +8400,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8418,13 +8418,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8435,11 +8435,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8451,11 +8451,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8469,13 +8469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8487,12 +8487,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8505,12 +8505,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8523,12 +8523,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8542,13 +8542,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8560,12 +8560,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8578,12 +8578,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8596,12 +8596,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8615,13 +8615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8636,13 +8636,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8653,10 +8653,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8667,10 +8667,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8681,13 +8681,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8697,10 +8697,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8711,10 +8711,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8725,10 +8725,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8741,12 +8741,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8758,11 +8758,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8774,11 +8774,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8789,11 +8789,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8805,11 +8805,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8823,13 +8823,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8840,11 +8840,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8856,11 +8856,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8874,13 +8874,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8891,11 +8891,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8907,11 +8907,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8925,13 +8925,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8943,12 +8943,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8961,12 +8961,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8979,12 +8979,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8998,13 +8998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9016,12 +9016,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9034,12 +9034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9052,12 +9052,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9071,13 +9071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9092,13 +9092,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9109,10 +9109,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9123,10 +9123,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9137,13 +9137,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9153,10 +9153,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9167,10 +9167,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9181,10 +9181,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9197,12 +9197,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9214,11 +9214,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9230,11 +9230,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9245,11 +9245,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9261,11 +9261,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9279,13 +9279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9296,11 +9296,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9312,11 +9312,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9330,13 +9330,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9347,11 +9347,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9363,11 +9363,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9381,13 +9381,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9399,12 +9399,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9417,12 +9417,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9435,12 +9435,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9454,13 +9454,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9472,12 +9472,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9490,12 +9490,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9508,12 +9508,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9527,13 +9527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9548,13 +9548,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9565,10 +9565,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9579,10 +9579,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9593,13 +9593,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9609,10 +9609,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9623,10 +9623,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9637,10 +9637,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9653,12 +9653,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9670,11 +9670,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9686,11 +9686,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9701,11 +9701,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9717,11 +9717,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9735,13 +9735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9752,11 +9752,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9768,11 +9768,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9786,13 +9786,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9803,11 +9803,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9819,11 +9819,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9837,13 +9837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9855,12 +9855,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9873,12 +9873,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9891,12 +9891,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9910,13 +9910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9928,12 +9928,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9946,12 +9946,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9964,12 +9964,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9983,13 +9983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10004,13 +10004,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10021,10 +10021,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10035,10 +10035,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10049,13 +10049,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10065,10 +10065,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10079,10 +10079,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10093,10 +10093,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10109,12 +10109,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10126,11 +10126,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10142,11 +10142,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10157,11 +10157,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10173,11 +10173,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10191,13 +10191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10208,11 +10208,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10224,11 +10224,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10242,13 +10242,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10259,11 +10259,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10275,11 +10275,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10293,13 +10293,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10311,12 +10311,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10329,12 +10329,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10347,12 +10347,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10366,13 +10366,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10384,12 +10384,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10402,12 +10402,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10420,12 +10420,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10439,13 +10439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10460,13 +10460,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10477,10 +10477,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10491,10 +10491,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10505,13 +10505,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10521,10 +10521,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10535,10 +10535,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10549,10 +10549,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10565,12 +10565,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10582,11 +10582,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10598,11 +10598,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10613,11 +10613,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10629,11 +10629,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10647,13 +10647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10664,11 +10664,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10680,11 +10680,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10698,13 +10698,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10715,11 +10715,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10731,11 +10731,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10749,13 +10749,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10767,12 +10767,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10785,12 +10785,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10803,12 +10803,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10822,13 +10822,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10840,12 +10840,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10858,12 +10858,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10876,12 +10876,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10895,13 +10895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10916,13 +10916,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10933,10 +10933,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10947,10 +10947,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10961,13 +10961,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10977,10 +10977,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10991,10 +10991,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11005,10 +11005,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11021,12 +11021,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11038,11 +11038,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11054,11 +11054,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11069,11 +11069,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11085,11 +11085,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11103,13 +11103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11120,11 +11120,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11136,11 +11136,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11154,13 +11154,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11171,11 +11171,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11187,11 +11187,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11205,13 +11205,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11223,12 +11223,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11241,12 +11241,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11259,12 +11259,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11278,13 +11278,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11296,12 +11296,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11314,12 +11314,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11332,12 +11332,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11351,13 +11351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11372,13 +11372,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11389,10 +11389,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11403,10 +11403,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11417,13 +11417,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11433,10 +11433,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11447,10 +11447,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11461,10 +11461,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11477,12 +11477,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11494,11 +11494,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11510,11 +11510,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11525,11 +11525,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11541,11 +11541,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11559,13 +11559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11576,11 +11576,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11592,11 +11592,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11610,13 +11610,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11627,11 +11627,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11643,11 +11643,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11661,13 +11661,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11679,12 +11679,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11697,12 +11697,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11715,12 +11715,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11734,13 +11734,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11752,12 +11752,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11770,12 +11770,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11788,12 +11788,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11807,13 +11807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11828,13 +11828,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11845,10 +11845,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11859,10 +11859,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11873,7 +11873,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -17351,7 +17351,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17361,10 +17361,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17375,10 +17375,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17389,10 +17389,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17405,12 +17405,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -17422,11 +17422,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17438,11 +17438,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17453,11 +17453,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17469,11 +17469,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17487,13 +17487,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17504,11 +17504,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17520,11 +17520,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17538,13 +17538,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17555,11 +17555,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17571,11 +17571,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17589,13 +17589,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17607,12 +17607,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17625,12 +17625,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17643,12 +17643,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17662,13 +17662,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17680,12 +17680,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17698,12 +17698,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17716,12 +17716,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17735,13 +17735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17756,13 +17756,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17773,10 +17773,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17787,10 +17787,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17801,13 +17801,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17817,10 +17817,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17831,10 +17831,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17845,10 +17845,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17861,12 +17861,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -17878,11 +17878,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17894,11 +17894,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17909,11 +17909,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17925,11 +17925,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17943,13 +17943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17960,11 +17960,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17976,11 +17976,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17994,13 +17994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18011,11 +18011,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18027,11 +18027,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18045,13 +18045,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18063,12 +18063,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18081,12 +18081,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18099,12 +18099,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18118,13 +18118,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18136,12 +18136,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18154,12 +18154,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18172,12 +18172,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18191,13 +18191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18212,13 +18212,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18229,10 +18229,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18243,10 +18243,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18257,13 +18257,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18273,10 +18273,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18287,10 +18287,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18301,10 +18301,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18317,12 +18317,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -18334,11 +18334,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18350,11 +18350,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18365,11 +18365,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18381,11 +18381,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18399,13 +18399,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18416,11 +18416,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18432,11 +18432,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18450,13 +18450,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18467,11 +18467,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18483,11 +18483,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18501,13 +18501,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18519,12 +18519,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18537,12 +18537,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18555,12 +18555,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18574,13 +18574,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18592,12 +18592,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18610,12 +18610,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18628,12 +18628,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18647,13 +18647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18668,13 +18668,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18685,10 +18685,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18699,10 +18699,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18713,13 +18713,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -18729,10 +18729,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -18743,10 +18743,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -18757,10 +18757,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18773,12 +18773,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -18790,11 +18790,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18806,11 +18806,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18821,11 +18821,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18837,11 +18837,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18855,13 +18855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18872,11 +18872,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18888,11 +18888,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18906,13 +18906,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18923,11 +18923,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18939,11 +18939,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18957,13 +18957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18975,12 +18975,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18993,12 +18993,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19011,12 +19011,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19030,13 +19030,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19048,12 +19048,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19066,12 +19066,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19084,12 +19084,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19103,13 +19103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19124,13 +19124,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -19141,10 +19141,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -19155,10 +19155,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -19169,7 +19169,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qc8-gemm-minmax-fp32.yaml b/test/qc8-gemm-minmax-fp32.yaml
index 638306e..a84880c 100644
--- a/test/qc8-gemm-minmax-fp32.yaml
+++ b/test/qc8-gemm-minmax-fp32.yaml
@@ -3,82 +3,82 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -117,16 +117,16 @@
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot
diff --git a/test/qc8-igemm-minmax-fp32.cc b/test/qc8-igemm-minmax-fp32.cc
index cd5f354..d268580 100644
--- a/test/qc8-igemm-minmax-fp32.cc
+++ b/test/qc8-igemm-minmax-fp32.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -63,12 +63,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -80,11 +80,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -96,11 +96,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -111,11 +111,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -129,13 +129,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -146,11 +146,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -164,13 +164,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -181,11 +181,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -199,13 +199,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -217,12 +217,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -235,12 +235,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -254,13 +254,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -272,12 +272,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -290,12 +290,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -309,13 +309,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -327,11 +327,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -346,13 +346,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -365,12 +365,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -383,12 +383,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -403,13 +403,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -422,11 +422,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -441,12 +441,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -457,10 +457,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -471,10 +471,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -485,13 +485,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -501,10 +501,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -515,10 +515,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -531,12 +531,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -548,11 +548,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -564,11 +564,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -579,11 +579,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -597,13 +597,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -614,11 +614,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -632,13 +632,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -649,11 +649,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -667,13 +667,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -685,12 +685,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -703,12 +703,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -722,13 +722,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -740,12 +740,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -758,12 +758,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -777,13 +777,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -795,11 +795,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -814,13 +814,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -833,12 +833,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -851,12 +851,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -871,13 +871,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -890,11 +890,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -909,12 +909,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -925,10 +925,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -939,10 +939,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -953,13 +953,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -969,10 +969,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -983,10 +983,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -999,12 +999,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1016,11 +1016,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1032,11 +1032,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1047,11 +1047,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1065,13 +1065,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1082,11 +1082,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1100,13 +1100,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1117,11 +1117,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1135,13 +1135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1153,12 +1153,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1171,12 +1171,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1190,13 +1190,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1208,12 +1208,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1226,12 +1226,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1245,13 +1245,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1263,11 +1263,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1282,13 +1282,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1301,12 +1301,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1319,12 +1319,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1339,13 +1339,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1358,11 +1358,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1377,12 +1377,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1393,10 +1393,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1407,10 +1407,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1421,13 +1421,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1437,10 +1437,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1451,10 +1451,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1467,12 +1467,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1484,11 +1484,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1500,11 +1500,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1515,11 +1515,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1533,13 +1533,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1550,11 +1550,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1568,13 +1568,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1585,11 +1585,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1603,13 +1603,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1621,12 +1621,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1639,12 +1639,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1658,13 +1658,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1676,12 +1676,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1694,12 +1694,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1713,13 +1713,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1731,11 +1731,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1750,13 +1750,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1769,12 +1769,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1787,12 +1787,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1807,13 +1807,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1826,11 +1826,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1845,12 +1845,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1861,10 +1861,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1875,10 +1875,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1889,13 +1889,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1905,10 +1905,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1919,10 +1919,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1935,12 +1935,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1952,11 +1952,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1968,11 +1968,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1983,11 +1983,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2001,13 +2001,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2018,11 +2018,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2036,13 +2036,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2053,11 +2053,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2071,13 +2071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2089,12 +2089,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2107,12 +2107,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2126,13 +2126,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2144,12 +2144,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2162,12 +2162,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2181,13 +2181,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2199,11 +2199,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2218,13 +2218,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2237,12 +2237,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2255,12 +2255,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2275,13 +2275,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2294,11 +2294,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2313,12 +2313,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2329,10 +2329,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2343,10 +2343,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2357,13 +2357,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2373,10 +2373,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2387,10 +2387,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2403,12 +2403,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2420,11 +2420,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2436,11 +2436,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2451,11 +2451,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2469,13 +2469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2486,11 +2486,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2504,13 +2504,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2521,11 +2521,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2539,13 +2539,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2557,12 +2557,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2575,12 +2575,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2594,13 +2594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2612,12 +2612,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2630,12 +2630,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2649,13 +2649,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2667,11 +2667,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2686,13 +2686,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2705,12 +2705,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2723,12 +2723,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2743,13 +2743,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2762,11 +2762,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2781,12 +2781,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2797,10 +2797,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2811,10 +2811,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2825,13 +2825,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2841,10 +2841,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2855,10 +2855,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2871,12 +2871,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -2888,11 +2888,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2904,11 +2904,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2919,11 +2919,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2937,13 +2937,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2954,11 +2954,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2972,13 +2972,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2989,11 +2989,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3007,13 +3007,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3025,12 +3025,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3043,12 +3043,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3062,13 +3062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3080,12 +3080,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3098,12 +3098,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3117,13 +3117,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3135,11 +3135,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3154,13 +3154,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3173,12 +3173,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3191,12 +3191,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3211,13 +3211,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3230,11 +3230,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3249,12 +3249,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3265,10 +3265,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3279,10 +3279,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3293,13 +3293,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3309,10 +3309,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3323,10 +3323,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3339,12 +3339,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -3356,11 +3356,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3372,11 +3372,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3387,11 +3387,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3405,13 +3405,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3422,11 +3422,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3440,13 +3440,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3457,11 +3457,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3475,13 +3475,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3493,12 +3493,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3511,12 +3511,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3530,13 +3530,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3548,12 +3548,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3566,12 +3566,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3585,13 +3585,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3603,11 +3603,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3622,13 +3622,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3641,12 +3641,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3659,12 +3659,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3679,13 +3679,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3698,11 +3698,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3717,12 +3717,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3733,10 +3733,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3747,10 +3747,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3761,13 +3761,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3777,10 +3777,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3791,10 +3791,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3807,12 +3807,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3855,11 +3855,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3873,13 +3873,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3890,11 +3890,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3908,13 +3908,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3925,11 +3925,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3943,13 +3943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3961,12 +3961,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3979,12 +3979,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3998,13 +3998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4016,12 +4016,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4034,12 +4034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4053,13 +4053,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4071,11 +4071,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4090,13 +4090,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4109,12 +4109,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4127,12 +4127,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4147,13 +4147,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4166,11 +4166,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4185,12 +4185,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4201,10 +4201,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4215,10 +4215,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4229,13 +4229,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4245,10 +4245,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4259,10 +4259,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4275,12 +4275,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4292,11 +4292,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4308,11 +4308,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4323,11 +4323,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4341,13 +4341,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4358,11 +4358,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4376,13 +4376,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4393,11 +4393,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4411,13 +4411,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4429,12 +4429,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4447,12 +4447,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4466,13 +4466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4484,12 +4484,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4502,12 +4502,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4521,13 +4521,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4539,11 +4539,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4558,13 +4558,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4577,12 +4577,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4595,12 +4595,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4615,13 +4615,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4634,11 +4634,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4653,12 +4653,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4669,10 +4669,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4683,10 +4683,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4697,13 +4697,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4713,10 +4713,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4727,10 +4727,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4743,12 +4743,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4760,11 +4760,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4776,11 +4776,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4791,11 +4791,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4809,13 +4809,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4826,11 +4826,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4844,13 +4844,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4861,11 +4861,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4879,13 +4879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4897,12 +4897,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4915,12 +4915,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4934,13 +4934,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4952,12 +4952,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4970,12 +4970,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4989,13 +4989,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5007,11 +5007,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5026,13 +5026,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5045,12 +5045,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5063,12 +5063,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5083,13 +5083,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5102,11 +5102,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5121,12 +5121,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5137,10 +5137,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5151,10 +5151,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5165,13 +5165,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5181,10 +5181,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5195,10 +5195,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5211,12 +5211,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -5228,11 +5228,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5244,11 +5244,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5294,11 +5294,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5312,13 +5312,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5329,11 +5329,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5347,13 +5347,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5365,12 +5365,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5383,12 +5383,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5402,13 +5402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5420,12 +5420,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5438,12 +5438,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5457,13 +5457,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5475,11 +5475,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5494,13 +5494,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5513,12 +5513,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5531,12 +5531,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5551,13 +5551,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5570,11 +5570,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5589,12 +5589,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5605,10 +5605,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5619,10 +5619,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5633,13 +5633,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5649,10 +5649,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5663,10 +5663,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5679,12 +5679,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5696,11 +5696,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5712,11 +5712,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5727,11 +5727,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5745,13 +5745,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5762,11 +5762,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5780,13 +5780,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5797,11 +5797,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5815,13 +5815,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5833,12 +5833,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5851,12 +5851,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5870,13 +5870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5888,12 +5888,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5906,12 +5906,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5925,13 +5925,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5943,11 +5943,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5962,13 +5962,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5981,12 +5981,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5999,12 +5999,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6019,13 +6019,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6038,11 +6038,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6057,12 +6057,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6073,10 +6073,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6087,10 +6087,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6101,13 +6101,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6117,10 +6117,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6131,10 +6131,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6147,12 +6147,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6164,11 +6164,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6180,11 +6180,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6195,11 +6195,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6213,13 +6213,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6230,11 +6230,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6248,13 +6248,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6265,11 +6265,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6283,13 +6283,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6301,12 +6301,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6319,12 +6319,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6338,13 +6338,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6356,12 +6356,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6374,12 +6374,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6393,13 +6393,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6411,11 +6411,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6430,13 +6430,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6449,12 +6449,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6467,12 +6467,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6487,13 +6487,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6506,11 +6506,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6525,12 +6525,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6541,10 +6541,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6555,10 +6555,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6569,13 +6569,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6585,10 +6585,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6599,10 +6599,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6615,12 +6615,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -6632,11 +6632,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6648,11 +6648,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6663,11 +6663,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6681,13 +6681,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6698,11 +6698,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6716,13 +6716,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6733,11 +6733,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6751,13 +6751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6769,12 +6769,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6787,12 +6787,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6806,13 +6806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6824,12 +6824,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6842,12 +6842,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6861,13 +6861,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6879,11 +6879,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6898,13 +6898,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6917,12 +6917,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6935,12 +6935,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6955,13 +6955,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6974,11 +6974,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6993,12 +6993,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -7009,10 +7009,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -7023,10 +7023,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -7037,13 +7037,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7053,10 +7053,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7067,10 +7067,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7083,12 +7083,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7100,11 +7100,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7116,11 +7116,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7131,11 +7131,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7149,13 +7149,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7166,11 +7166,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7184,13 +7184,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7201,11 +7201,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7219,13 +7219,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7237,12 +7237,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7255,12 +7255,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7274,13 +7274,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7292,12 +7292,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7310,12 +7310,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7329,13 +7329,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7347,11 +7347,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7366,13 +7366,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7385,12 +7385,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7403,12 +7403,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7423,13 +7423,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7442,11 +7442,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7461,12 +7461,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7477,10 +7477,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7491,10 +7491,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7505,13 +7505,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7521,10 +7521,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7535,10 +7535,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7551,12 +7551,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7568,11 +7568,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7584,11 +7584,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7599,11 +7599,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7617,13 +7617,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7634,11 +7634,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7652,13 +7652,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7669,11 +7669,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7687,13 +7687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7705,12 +7705,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7723,12 +7723,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7742,13 +7742,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7760,12 +7760,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7778,12 +7778,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7797,13 +7797,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7815,11 +7815,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7834,13 +7834,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7853,12 +7853,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7871,12 +7871,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7891,13 +7891,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7910,11 +7910,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7929,12 +7929,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7945,10 +7945,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7959,10 +7959,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7973,13 +7973,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7989,10 +7989,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8003,10 +8003,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8019,12 +8019,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8036,11 +8036,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8052,11 +8052,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8067,11 +8067,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8085,13 +8085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8102,11 +8102,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8120,13 +8120,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8137,11 +8137,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8155,13 +8155,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8173,12 +8173,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8191,12 +8191,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8210,13 +8210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8228,12 +8228,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8246,12 +8246,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8265,13 +8265,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8283,11 +8283,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8302,13 +8302,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8321,12 +8321,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8339,12 +8339,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8359,13 +8359,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8378,11 +8378,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8397,12 +8397,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8413,10 +8413,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8427,10 +8427,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8441,13 +8441,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8457,10 +8457,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8471,10 +8471,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8487,12 +8487,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8504,11 +8504,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8520,11 +8520,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8535,11 +8535,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8553,13 +8553,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8570,11 +8570,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8588,13 +8588,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8605,11 +8605,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8623,13 +8623,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8641,12 +8641,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8659,12 +8659,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8678,13 +8678,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8696,12 +8696,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8714,12 +8714,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8733,13 +8733,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8751,11 +8751,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8770,13 +8770,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8789,12 +8789,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8807,12 +8807,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8827,13 +8827,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8846,11 +8846,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8865,12 +8865,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8881,10 +8881,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8895,10 +8895,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8909,13 +8909,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8925,10 +8925,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8939,10 +8939,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8955,12 +8955,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8972,11 +8972,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8988,11 +8988,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9003,11 +9003,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9021,13 +9021,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9038,11 +9038,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9056,13 +9056,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9073,11 +9073,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9091,13 +9091,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9109,12 +9109,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9127,12 +9127,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9146,13 +9146,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9164,12 +9164,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9182,12 +9182,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9201,13 +9201,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9219,11 +9219,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9238,13 +9238,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9257,12 +9257,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9275,12 +9275,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9295,13 +9295,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9314,11 +9314,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9333,12 +9333,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9349,10 +9349,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9363,10 +9363,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9377,13 +9377,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9393,10 +9393,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9407,10 +9407,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9423,12 +9423,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9440,11 +9440,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9456,11 +9456,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9471,11 +9471,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9489,13 +9489,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9506,11 +9506,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9524,13 +9524,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9541,11 +9541,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9559,13 +9559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9577,12 +9577,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9595,12 +9595,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9614,13 +9614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9632,12 +9632,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9650,12 +9650,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9669,13 +9669,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9687,11 +9687,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9706,13 +9706,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9725,12 +9725,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9743,12 +9743,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9763,13 +9763,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9782,11 +9782,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9801,12 +9801,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9817,10 +9817,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9831,10 +9831,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9845,13 +9845,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9861,10 +9861,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9875,10 +9875,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9891,12 +9891,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -9908,11 +9908,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9924,11 +9924,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9939,11 +9939,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9957,13 +9957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9974,11 +9974,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9992,13 +9992,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10009,11 +10009,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10027,13 +10027,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10045,12 +10045,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10063,12 +10063,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10082,13 +10082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10100,12 +10100,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10118,12 +10118,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10137,13 +10137,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10155,11 +10155,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10174,13 +10174,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10193,12 +10193,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10211,12 +10211,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10231,13 +10231,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10250,11 +10250,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10269,12 +10269,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10285,10 +10285,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10299,10 +10299,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10313,13 +10313,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10329,10 +10329,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10343,10 +10343,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10359,12 +10359,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10376,11 +10376,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10392,11 +10392,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10407,11 +10407,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10425,13 +10425,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10442,11 +10442,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10460,13 +10460,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10477,11 +10477,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10495,13 +10495,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10513,12 +10513,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10531,12 +10531,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10550,13 +10550,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10568,12 +10568,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10586,12 +10586,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10605,13 +10605,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10623,11 +10623,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10642,13 +10642,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10661,12 +10661,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10679,12 +10679,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10699,13 +10699,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10718,11 +10718,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10737,12 +10737,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10753,10 +10753,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10767,10 +10767,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10781,13 +10781,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10797,10 +10797,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10811,10 +10811,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10827,12 +10827,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10844,11 +10844,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10860,11 +10860,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10875,11 +10875,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10893,13 +10893,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10910,11 +10910,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10928,13 +10928,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10945,11 +10945,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10963,13 +10963,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10981,12 +10981,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10999,12 +10999,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11018,13 +11018,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11036,12 +11036,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11054,12 +11054,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11073,13 +11073,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11091,11 +11091,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11110,13 +11110,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11129,12 +11129,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11147,12 +11147,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11167,13 +11167,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11186,11 +11186,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11205,12 +11205,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11221,10 +11221,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11235,10 +11235,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11249,13 +11249,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11265,10 +11265,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11279,10 +11279,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11295,12 +11295,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11312,11 +11312,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11328,11 +11328,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11343,11 +11343,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11361,13 +11361,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11378,11 +11378,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11396,13 +11396,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11413,11 +11413,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11431,13 +11431,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11449,12 +11449,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11467,12 +11467,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11486,13 +11486,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11504,12 +11504,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11522,12 +11522,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11541,13 +11541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11559,11 +11559,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11578,13 +11578,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11597,12 +11597,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11615,12 +11615,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11635,13 +11635,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11654,11 +11654,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11673,12 +11673,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11689,10 +11689,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11703,10 +11703,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11717,7 +11717,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -15935,7 +15935,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15945,10 +15945,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15959,10 +15959,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15975,12 +15975,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -15992,11 +15992,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16008,11 +16008,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16023,11 +16023,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16041,13 +16041,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16058,11 +16058,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16076,13 +16076,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -16093,11 +16093,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16111,13 +16111,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16129,12 +16129,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16147,12 +16147,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16166,13 +16166,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16184,12 +16184,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16202,12 +16202,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16221,13 +16221,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16239,11 +16239,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16258,13 +16258,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16277,12 +16277,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16295,12 +16295,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16315,13 +16315,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16334,11 +16334,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16353,12 +16353,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16369,10 +16369,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16383,10 +16383,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16397,13 +16397,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16413,10 +16413,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16427,10 +16427,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16443,12 +16443,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -16460,11 +16460,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16476,11 +16476,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16491,11 +16491,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16509,13 +16509,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16526,11 +16526,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16544,13 +16544,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -16561,11 +16561,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16579,13 +16579,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16597,12 +16597,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16615,12 +16615,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16634,13 +16634,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16652,12 +16652,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16670,12 +16670,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16689,13 +16689,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16707,11 +16707,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16726,13 +16726,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16745,12 +16745,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16763,12 +16763,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16783,13 +16783,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16802,11 +16802,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16821,12 +16821,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16837,10 +16837,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16851,10 +16851,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16865,13 +16865,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_fp32_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -16881,10 +16881,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -16895,10 +16895,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16911,12 +16911,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16928,11 +16928,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16944,11 +16944,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16959,11 +16959,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16977,13 +16977,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16994,11 +16994,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17012,13 +17012,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17029,11 +17029,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17047,13 +17047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17065,12 +17065,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17083,12 +17083,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17102,13 +17102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17120,12 +17120,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17138,12 +17138,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17157,13 +17157,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17175,11 +17175,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17194,13 +17194,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17213,12 +17213,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17231,12 +17231,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17251,13 +17251,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17270,11 +17270,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17289,12 +17289,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -17305,10 +17305,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -17319,10 +17319,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -17333,13 +17333,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17349,10 +17349,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17363,10 +17363,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17379,12 +17379,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -17396,11 +17396,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17412,11 +17412,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17427,11 +17427,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17445,13 +17445,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17462,11 +17462,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17480,13 +17480,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17497,11 +17497,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17515,13 +17515,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17533,12 +17533,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17551,12 +17551,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17570,13 +17570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17588,12 +17588,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17606,12 +17606,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17625,13 +17625,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17643,11 +17643,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17662,13 +17662,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17681,12 +17681,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17699,12 +17699,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17719,13 +17719,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17738,11 +17738,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17757,12 +17757,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17773,10 +17773,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17787,10 +17787,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17801,7 +17801,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qc8-igemm-minmax-fp32.yaml b/test/qc8-igemm-minmax-fp32.yaml
index c4d254c..0546c5f 100644
--- a/test/qc8-igemm-minmax-fp32.yaml
+++ b/test/qc8-igemm-minmax-fp32.yaml
@@ -3,79 +3,79 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -105,16 +105,16 @@
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_minmax_neon_fp32_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index 6034268..35c18ae 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -61,10 +61,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -77,12 +77,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -94,11 +94,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -110,11 +110,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -125,11 +125,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -141,11 +141,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -159,13 +159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -176,11 +176,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -192,11 +192,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -210,13 +210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -227,11 +227,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -243,11 +243,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -261,13 +261,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -279,12 +279,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -297,12 +297,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -315,12 +315,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -334,13 +334,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -352,12 +352,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -370,12 +370,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -388,12 +388,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -407,13 +407,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -428,13 +428,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -445,10 +445,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -459,10 +459,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -473,13 +473,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -489,10 +489,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -503,10 +503,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -517,10 +517,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -533,12 +533,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -550,11 +550,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -566,11 +566,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -581,11 +581,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -597,11 +597,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -615,13 +615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -632,11 +632,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -648,11 +648,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -666,13 +666,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -683,11 +683,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -699,11 +699,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -717,13 +717,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -735,12 +735,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -753,12 +753,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -771,12 +771,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -790,13 +790,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -808,12 +808,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -826,12 +826,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -844,12 +844,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -863,13 +863,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -884,13 +884,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -901,10 +901,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -915,10 +915,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -929,13 +929,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -945,10 +945,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -959,10 +959,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -973,10 +973,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -989,12 +989,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1006,11 +1006,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1022,11 +1022,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1037,11 +1037,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1053,11 +1053,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1071,13 +1071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1088,11 +1088,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1104,11 +1104,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1122,13 +1122,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1139,11 +1139,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1155,11 +1155,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1173,13 +1173,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1191,12 +1191,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1209,12 +1209,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1227,12 +1227,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1246,13 +1246,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1264,12 +1264,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1282,12 +1282,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1300,12 +1300,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1319,13 +1319,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1340,13 +1340,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1357,10 +1357,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1371,10 +1371,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1385,13 +1385,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1401,10 +1401,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1415,10 +1415,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1429,10 +1429,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1445,12 +1445,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1462,11 +1462,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1478,11 +1478,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1493,11 +1493,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1509,11 +1509,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1527,13 +1527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1544,11 +1544,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1560,11 +1560,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1578,13 +1578,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1595,11 +1595,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1611,11 +1611,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1629,13 +1629,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1647,12 +1647,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1665,12 +1665,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1683,12 +1683,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1702,13 +1702,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1720,12 +1720,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1738,12 +1738,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1756,12 +1756,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1775,13 +1775,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1796,13 +1796,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1813,10 +1813,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1827,10 +1827,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1841,13 +1841,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1857,10 +1857,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1871,10 +1871,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1885,10 +1885,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1901,12 +1901,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1918,11 +1918,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1934,11 +1934,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1949,11 +1949,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1965,11 +1965,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1983,13 +1983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2000,11 +2000,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2016,11 +2016,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2034,13 +2034,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2051,11 +2051,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2067,11 +2067,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2085,13 +2085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2103,12 +2103,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2121,12 +2121,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2139,12 +2139,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2158,13 +2158,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2176,12 +2176,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2194,12 +2194,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2212,12 +2212,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2231,13 +2231,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2252,13 +2252,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2269,10 +2269,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2283,10 +2283,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2297,13 +2297,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2313,10 +2313,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2327,10 +2327,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2341,10 +2341,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2357,12 +2357,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2374,11 +2374,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2390,11 +2390,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2405,11 +2405,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2421,11 +2421,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2439,13 +2439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2456,11 +2456,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2472,11 +2472,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2490,13 +2490,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2507,11 +2507,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2523,11 +2523,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2541,13 +2541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2559,12 +2559,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2577,12 +2577,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2595,12 +2595,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2614,13 +2614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2632,12 +2632,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2650,12 +2650,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2668,12 +2668,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2687,13 +2687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2708,13 +2708,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2725,10 +2725,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2739,10 +2739,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2753,13 +2753,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2769,10 +2769,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2783,10 +2783,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2797,10 +2797,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2813,12 +2813,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -2830,11 +2830,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2846,11 +2846,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2861,11 +2861,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2877,11 +2877,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2895,13 +2895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2912,11 +2912,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2928,11 +2928,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2946,13 +2946,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2963,11 +2963,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2979,11 +2979,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2997,13 +2997,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3015,12 +3015,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3033,12 +3033,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3051,12 +3051,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3070,13 +3070,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3088,12 +3088,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3106,12 +3106,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3124,12 +3124,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3143,13 +3143,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3164,13 +3164,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3181,10 +3181,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3195,10 +3195,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3209,13 +3209,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3225,10 +3225,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3239,10 +3239,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3253,10 +3253,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3269,12 +3269,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -3286,11 +3286,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3302,11 +3302,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3317,11 +3317,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3333,11 +3333,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3351,13 +3351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3368,11 +3368,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3384,11 +3384,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3402,13 +3402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3419,11 +3419,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3435,11 +3435,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3453,13 +3453,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3471,12 +3471,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3489,12 +3489,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3507,12 +3507,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3526,13 +3526,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3544,12 +3544,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3562,12 +3562,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3580,12 +3580,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3599,13 +3599,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3620,13 +3620,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3637,10 +3637,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3651,10 +3651,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3665,13 +3665,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3681,10 +3681,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3695,10 +3695,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3709,10 +3709,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3725,12 +3725,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3742,11 +3742,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3758,11 +3758,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3773,11 +3773,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3789,11 +3789,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3807,13 +3807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3858,13 +3858,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3875,11 +3875,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3891,11 +3891,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3909,13 +3909,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3927,12 +3927,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3945,12 +3945,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3963,12 +3963,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3982,13 +3982,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4000,12 +4000,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4018,12 +4018,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4036,12 +4036,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4055,13 +4055,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4076,13 +4076,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4093,10 +4093,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4107,10 +4107,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4121,13 +4121,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4137,10 +4137,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4151,10 +4151,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4165,10 +4165,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4181,12 +4181,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4198,11 +4198,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4214,11 +4214,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4229,11 +4229,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4245,11 +4245,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4263,13 +4263,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4280,11 +4280,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4296,11 +4296,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4314,13 +4314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4331,11 +4331,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4347,11 +4347,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4365,13 +4365,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4383,12 +4383,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4401,12 +4401,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4419,12 +4419,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4438,13 +4438,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4456,12 +4456,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4474,12 +4474,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4492,12 +4492,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4511,13 +4511,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4532,13 +4532,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4549,10 +4549,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4563,10 +4563,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4577,13 +4577,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4593,10 +4593,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4607,10 +4607,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4621,10 +4621,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4637,12 +4637,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4654,11 +4654,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4670,11 +4670,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4685,11 +4685,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4701,11 +4701,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4719,13 +4719,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4736,11 +4736,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4752,11 +4752,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4770,13 +4770,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4787,11 +4787,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4803,11 +4803,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4821,13 +4821,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4839,12 +4839,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4857,12 +4857,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4875,12 +4875,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4894,13 +4894,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4912,12 +4912,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4930,12 +4930,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4948,12 +4948,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4967,13 +4967,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4988,13 +4988,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5005,10 +5005,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5019,10 +5019,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5033,13 +5033,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5049,10 +5049,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5063,10 +5063,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5077,10 +5077,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5093,12 +5093,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -5110,11 +5110,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5126,11 +5126,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5141,11 +5141,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5157,11 +5157,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5175,13 +5175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5192,11 +5192,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5208,11 +5208,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5226,13 +5226,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5243,11 +5243,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5295,12 +5295,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5313,12 +5313,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5331,12 +5331,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5350,13 +5350,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5368,12 +5368,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5386,12 +5386,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5404,12 +5404,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5423,13 +5423,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5444,13 +5444,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5461,10 +5461,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5475,10 +5475,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5489,13 +5489,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5505,10 +5505,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5519,10 +5519,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5533,10 +5533,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5549,12 +5549,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5566,11 +5566,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5582,11 +5582,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5597,11 +5597,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5613,11 +5613,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5631,13 +5631,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5648,11 +5648,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5664,11 +5664,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5682,13 +5682,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5699,11 +5699,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5715,11 +5715,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5733,13 +5733,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5751,12 +5751,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5769,12 +5769,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5787,12 +5787,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5806,13 +5806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5824,12 +5824,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5842,12 +5842,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5860,12 +5860,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5879,13 +5879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5900,13 +5900,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5917,10 +5917,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5931,10 +5931,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5945,13 +5945,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5961,10 +5961,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5975,10 +5975,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5989,10 +5989,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6005,12 +6005,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6022,11 +6022,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6038,11 +6038,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6053,11 +6053,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6069,11 +6069,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6087,13 +6087,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6104,11 +6104,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6120,11 +6120,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6138,13 +6138,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6155,11 +6155,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6171,11 +6171,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6189,13 +6189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6207,12 +6207,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6225,12 +6225,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6243,12 +6243,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6262,13 +6262,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6280,12 +6280,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6298,12 +6298,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6316,12 +6316,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6335,13 +6335,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6356,13 +6356,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6373,10 +6373,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6387,10 +6387,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6401,13 +6401,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6417,10 +6417,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6431,10 +6431,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6445,10 +6445,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6461,12 +6461,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -6478,11 +6478,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6494,11 +6494,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6509,11 +6509,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6525,11 +6525,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6543,13 +6543,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6560,11 +6560,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6576,11 +6576,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6594,13 +6594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6611,11 +6611,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6627,11 +6627,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6645,13 +6645,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6663,12 +6663,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6681,12 +6681,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6699,12 +6699,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6718,13 +6718,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6736,12 +6736,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6754,12 +6754,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6772,12 +6772,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6791,13 +6791,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6812,13 +6812,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6829,10 +6829,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6843,10 +6843,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6857,13 +6857,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -6873,10 +6873,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -6887,10 +6887,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -6901,10 +6901,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6917,12 +6917,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6934,11 +6934,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6950,11 +6950,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6965,11 +6965,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6981,11 +6981,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6999,13 +6999,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7016,11 +7016,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7032,11 +7032,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7050,13 +7050,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7067,11 +7067,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7083,11 +7083,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7101,13 +7101,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7119,12 +7119,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7137,12 +7137,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7155,12 +7155,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7174,13 +7174,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7192,12 +7192,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7210,12 +7210,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7228,12 +7228,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7247,13 +7247,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7268,13 +7268,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7285,10 +7285,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7299,10 +7299,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7313,13 +7313,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7329,10 +7329,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7343,10 +7343,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7357,10 +7357,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7373,12 +7373,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7390,11 +7390,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7406,11 +7406,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7421,11 +7421,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7437,11 +7437,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7455,13 +7455,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7472,11 +7472,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7488,11 +7488,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7506,13 +7506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7523,11 +7523,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7539,11 +7539,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7557,13 +7557,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7575,12 +7575,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7593,12 +7593,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7611,12 +7611,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7630,13 +7630,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7648,12 +7648,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7666,12 +7666,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7684,12 +7684,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7703,13 +7703,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7724,13 +7724,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7741,10 +7741,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7755,10 +7755,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7769,13 +7769,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7785,10 +7785,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7799,10 +7799,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7813,10 +7813,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7829,12 +7829,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7846,11 +7846,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7862,11 +7862,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7877,11 +7877,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7893,11 +7893,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7911,13 +7911,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7928,11 +7928,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7944,11 +7944,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7962,13 +7962,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7979,11 +7979,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7995,11 +7995,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8013,13 +8013,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8031,12 +8031,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8049,12 +8049,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8067,12 +8067,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8086,13 +8086,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8104,12 +8104,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8122,12 +8122,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8140,12 +8140,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8159,13 +8159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8180,13 +8180,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8197,10 +8197,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8211,10 +8211,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8225,13 +8225,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8241,10 +8241,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8255,10 +8255,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8269,10 +8269,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8285,12 +8285,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8302,11 +8302,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8318,11 +8318,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8333,11 +8333,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8349,11 +8349,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8367,13 +8367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8384,11 +8384,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8400,11 +8400,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8418,13 +8418,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8435,11 +8435,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8451,11 +8451,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8469,13 +8469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8487,12 +8487,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8505,12 +8505,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8523,12 +8523,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8542,13 +8542,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8560,12 +8560,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8578,12 +8578,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8596,12 +8596,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8615,13 +8615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8636,13 +8636,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8653,10 +8653,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8667,10 +8667,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8681,13 +8681,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8697,10 +8697,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8711,10 +8711,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8725,10 +8725,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8741,12 +8741,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8758,11 +8758,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8774,11 +8774,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8789,11 +8789,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8805,11 +8805,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8823,13 +8823,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8840,11 +8840,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8856,11 +8856,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8874,13 +8874,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8891,11 +8891,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8907,11 +8907,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8925,13 +8925,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8943,12 +8943,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8961,12 +8961,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8979,12 +8979,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8998,13 +8998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9016,12 +9016,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9034,12 +9034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9052,12 +9052,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9071,13 +9071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9092,13 +9092,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9109,10 +9109,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9123,10 +9123,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9137,13 +9137,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9153,10 +9153,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9167,10 +9167,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9181,10 +9181,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9197,12 +9197,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9214,11 +9214,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9230,11 +9230,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9245,11 +9245,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9261,11 +9261,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9279,13 +9279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9296,11 +9296,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9312,11 +9312,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9330,13 +9330,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9347,11 +9347,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9363,11 +9363,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9381,13 +9381,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9399,12 +9399,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9417,12 +9417,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9435,12 +9435,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9454,13 +9454,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9472,12 +9472,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9490,12 +9490,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9508,12 +9508,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9527,13 +9527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9548,13 +9548,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9565,10 +9565,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9579,10 +9579,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9593,13 +9593,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9609,10 +9609,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9623,10 +9623,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9637,10 +9637,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9653,12 +9653,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9670,11 +9670,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9686,11 +9686,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9701,11 +9701,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9717,11 +9717,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9735,13 +9735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9752,11 +9752,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9768,11 +9768,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9786,13 +9786,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9803,11 +9803,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9819,11 +9819,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9837,13 +9837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9855,12 +9855,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9873,12 +9873,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9891,12 +9891,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9910,13 +9910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9928,12 +9928,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9946,12 +9946,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9964,12 +9964,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9983,13 +9983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10004,13 +10004,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10021,10 +10021,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10035,10 +10035,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10049,13 +10049,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10065,10 +10065,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10079,10 +10079,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10093,10 +10093,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10109,12 +10109,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10126,11 +10126,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10142,11 +10142,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10157,11 +10157,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10173,11 +10173,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10191,13 +10191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10208,11 +10208,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10224,11 +10224,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10242,13 +10242,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10259,11 +10259,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10275,11 +10275,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10293,13 +10293,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10311,12 +10311,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10329,12 +10329,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10347,12 +10347,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10366,13 +10366,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10384,12 +10384,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10402,12 +10402,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10420,12 +10420,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10439,13 +10439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10460,13 +10460,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10477,10 +10477,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10491,10 +10491,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10505,13 +10505,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10521,10 +10521,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10535,10 +10535,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10549,10 +10549,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10565,12 +10565,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10582,11 +10582,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10598,11 +10598,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10613,11 +10613,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10629,11 +10629,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10647,13 +10647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10664,11 +10664,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10680,11 +10680,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10698,13 +10698,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10715,11 +10715,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10731,11 +10731,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10749,13 +10749,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10767,12 +10767,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10785,12 +10785,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10803,12 +10803,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10822,13 +10822,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10840,12 +10840,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10858,12 +10858,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10876,12 +10876,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10895,13 +10895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10916,13 +10916,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10933,10 +10933,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10947,10 +10947,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10961,13 +10961,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10977,10 +10977,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10991,10 +10991,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11005,10 +11005,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11021,12 +11021,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11038,11 +11038,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11054,11 +11054,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11069,11 +11069,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11085,11 +11085,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11103,13 +11103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11120,11 +11120,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11136,11 +11136,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11154,13 +11154,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11171,11 +11171,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11187,11 +11187,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11205,13 +11205,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11223,12 +11223,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11241,12 +11241,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11259,12 +11259,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11278,13 +11278,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11296,12 +11296,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11314,12 +11314,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11332,12 +11332,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11351,13 +11351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11372,13 +11372,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11389,10 +11389,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11403,10 +11403,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11417,13 +11417,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11433,10 +11433,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11447,10 +11447,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11461,10 +11461,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11477,12 +11477,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11494,11 +11494,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11510,11 +11510,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11525,11 +11525,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11541,11 +11541,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11559,13 +11559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11576,11 +11576,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11592,11 +11592,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11610,13 +11610,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11627,11 +11627,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11643,11 +11643,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11661,13 +11661,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11679,12 +11679,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11697,12 +11697,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11715,12 +11715,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11734,13 +11734,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11752,12 +11752,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11770,12 +11770,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11788,12 +11788,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11807,13 +11807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11828,13 +11828,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11845,10 +11845,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11859,10 +11859,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11873,7 +11873,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -17351,7 +17351,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17361,10 +17361,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17375,10 +17375,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17389,10 +17389,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17405,12 +17405,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -17422,11 +17422,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17438,11 +17438,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17453,11 +17453,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17469,11 +17469,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17487,13 +17487,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17504,11 +17504,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17520,11 +17520,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17538,13 +17538,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17555,11 +17555,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17571,11 +17571,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17589,13 +17589,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17607,12 +17607,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17625,12 +17625,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17643,12 +17643,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17662,13 +17662,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17680,12 +17680,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17698,12 +17698,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17716,12 +17716,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17735,13 +17735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17756,13 +17756,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17773,10 +17773,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17787,10 +17787,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17801,13 +17801,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17817,10 +17817,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17831,10 +17831,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17845,10 +17845,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17861,12 +17861,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -17878,11 +17878,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17894,11 +17894,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17909,11 +17909,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17925,11 +17925,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17943,13 +17943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17960,11 +17960,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17976,11 +17976,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17994,13 +17994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18011,11 +18011,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18027,11 +18027,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18045,13 +18045,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18063,12 +18063,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18081,12 +18081,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18099,12 +18099,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18118,13 +18118,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18136,12 +18136,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18154,12 +18154,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18172,12 +18172,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18191,13 +18191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18212,13 +18212,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18229,10 +18229,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18243,10 +18243,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18257,13 +18257,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18273,10 +18273,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18287,10 +18287,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18301,10 +18301,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18317,12 +18317,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -18334,11 +18334,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18350,11 +18350,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18365,11 +18365,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18381,11 +18381,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18399,13 +18399,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18416,11 +18416,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18432,11 +18432,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18450,13 +18450,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18467,11 +18467,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18483,11 +18483,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18501,13 +18501,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18519,12 +18519,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18537,12 +18537,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18555,12 +18555,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18574,13 +18574,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18592,12 +18592,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18610,12 +18610,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18628,12 +18628,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18647,13 +18647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18668,13 +18668,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18685,10 +18685,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18699,10 +18699,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -18713,13 +18713,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -18729,10 +18729,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -18743,10 +18743,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -18757,10 +18757,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18773,12 +18773,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -18790,11 +18790,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18806,11 +18806,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18821,11 +18821,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18837,11 +18837,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18855,13 +18855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18872,11 +18872,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18888,11 +18888,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18906,13 +18906,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18923,11 +18923,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18939,11 +18939,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18957,13 +18957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18975,12 +18975,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18993,12 +18993,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19011,12 +19011,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19030,13 +19030,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19048,12 +19048,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19066,12 +19066,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19084,12 +19084,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19103,13 +19103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19124,13 +19124,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -19141,10 +19141,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -19155,10 +19155,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -19169,7 +19169,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index 25fd06b..4ddc970 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -3,82 +3,82 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -117,16 +117,16 @@
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neondot
diff --git a/test/qs8-gemm-minmax-gemmlowp.cc b/test/qs8-gemm-minmax-gemmlowp.cc
index bef5741..8df86f6 100644
--- a/test/qs8-gemm-minmax-gemmlowp.cc
+++ b/test/qs8-gemm-minmax-gemmlowp.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -61,10 +61,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -77,12 +77,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -94,11 +94,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -110,11 +110,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -125,11 +125,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -141,11 +141,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -159,13 +159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -176,11 +176,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -192,11 +192,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -210,13 +210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -227,11 +227,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -243,11 +243,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -261,13 +261,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -279,12 +279,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -297,12 +297,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -315,12 +315,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -334,13 +334,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -352,12 +352,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -370,12 +370,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -388,12 +388,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -407,13 +407,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -428,13 +428,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -445,10 +445,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -459,10 +459,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -473,13 +473,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -489,10 +489,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -503,10 +503,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -517,10 +517,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -533,12 +533,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -550,11 +550,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -566,11 +566,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -581,11 +581,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -597,11 +597,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -615,13 +615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -632,11 +632,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -648,11 +648,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -666,13 +666,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -683,11 +683,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -699,11 +699,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -717,13 +717,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -735,12 +735,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -753,12 +753,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -771,12 +771,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -790,13 +790,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -808,12 +808,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -826,12 +826,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -844,12 +844,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -863,13 +863,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -884,13 +884,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -901,10 +901,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -915,10 +915,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -929,13 +929,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -945,10 +945,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -959,10 +959,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -973,10 +973,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -989,12 +989,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1006,11 +1006,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1022,11 +1022,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1037,11 +1037,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1053,11 +1053,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1071,13 +1071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1088,11 +1088,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1104,11 +1104,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1122,13 +1122,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1139,11 +1139,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1155,11 +1155,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1173,13 +1173,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1191,12 +1191,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1209,12 +1209,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1227,12 +1227,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1246,13 +1246,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1264,12 +1264,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1282,12 +1282,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1300,12 +1300,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1319,13 +1319,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1340,13 +1340,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1357,10 +1357,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1371,10 +1371,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1385,13 +1385,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1401,10 +1401,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1415,10 +1415,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1429,10 +1429,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1445,12 +1445,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1462,11 +1462,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1478,11 +1478,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1493,11 +1493,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1509,11 +1509,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1527,13 +1527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1544,11 +1544,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1560,11 +1560,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1578,13 +1578,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1595,11 +1595,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1611,11 +1611,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1629,13 +1629,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1647,12 +1647,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1665,12 +1665,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1683,12 +1683,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1702,13 +1702,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1720,12 +1720,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1738,12 +1738,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1756,12 +1756,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1775,13 +1775,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1796,13 +1796,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1813,10 +1813,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1827,10 +1827,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1841,13 +1841,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1857,10 +1857,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1871,10 +1871,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1885,10 +1885,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1901,12 +1901,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1918,11 +1918,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1934,11 +1934,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1949,11 +1949,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1965,11 +1965,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1983,13 +1983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2000,11 +2000,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2016,11 +2016,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2034,13 +2034,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2051,11 +2051,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2067,11 +2067,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2085,13 +2085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2103,12 +2103,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2121,12 +2121,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2139,12 +2139,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2158,13 +2158,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2176,12 +2176,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2194,12 +2194,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2212,12 +2212,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2231,13 +2231,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2252,13 +2252,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2269,10 +2269,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2283,10 +2283,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2297,13 +2297,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2313,10 +2313,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2327,10 +2327,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2341,10 +2341,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2357,12 +2357,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2374,11 +2374,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2390,11 +2390,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2405,11 +2405,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2421,11 +2421,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2439,13 +2439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2456,11 +2456,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2472,11 +2472,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2490,13 +2490,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2507,11 +2507,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2523,11 +2523,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2541,13 +2541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2559,12 +2559,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2577,12 +2577,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2595,12 +2595,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2614,13 +2614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2632,12 +2632,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2650,12 +2650,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2668,12 +2668,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2687,13 +2687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2708,13 +2708,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2725,10 +2725,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2739,10 +2739,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2753,13 +2753,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2769,10 +2769,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2783,10 +2783,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2797,10 +2797,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2813,12 +2813,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2830,11 +2830,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2846,11 +2846,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2861,11 +2861,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2877,11 +2877,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2895,13 +2895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2912,11 +2912,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2928,11 +2928,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2946,13 +2946,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2963,11 +2963,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2979,11 +2979,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2997,13 +2997,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3015,12 +3015,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3033,12 +3033,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3051,12 +3051,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3070,13 +3070,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3088,12 +3088,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3106,12 +3106,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3124,12 +3124,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3143,13 +3143,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3164,13 +3164,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3181,10 +3181,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3195,10 +3195,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3209,13 +3209,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3225,10 +3225,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3239,10 +3239,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3253,10 +3253,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3269,12 +3269,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -3286,11 +3286,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3302,11 +3302,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3317,11 +3317,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3333,11 +3333,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3351,13 +3351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3368,11 +3368,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3384,11 +3384,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3402,13 +3402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3419,11 +3419,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3435,11 +3435,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3453,13 +3453,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3471,12 +3471,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3489,12 +3489,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3507,12 +3507,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3526,13 +3526,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3544,12 +3544,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3562,12 +3562,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3580,12 +3580,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3599,13 +3599,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3620,13 +3620,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3637,10 +3637,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3651,10 +3651,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3665,13 +3665,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3681,10 +3681,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3695,10 +3695,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3709,10 +3709,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3725,12 +3725,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3742,11 +3742,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3758,11 +3758,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3773,11 +3773,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3789,11 +3789,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3807,13 +3807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3858,13 +3858,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3875,11 +3875,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3891,11 +3891,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3909,13 +3909,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3927,12 +3927,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3945,12 +3945,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3963,12 +3963,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3982,13 +3982,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4000,12 +4000,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4018,12 +4018,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4036,12 +4036,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4055,13 +4055,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4076,13 +4076,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4093,10 +4093,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4107,10 +4107,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4121,13 +4121,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4137,10 +4137,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4151,10 +4151,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4165,10 +4165,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4181,12 +4181,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4198,11 +4198,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4214,11 +4214,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4229,11 +4229,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4245,11 +4245,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4263,13 +4263,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4280,11 +4280,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4296,11 +4296,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4314,13 +4314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4331,11 +4331,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4347,11 +4347,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4365,13 +4365,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4383,12 +4383,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4401,12 +4401,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4419,12 +4419,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4438,13 +4438,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4456,12 +4456,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4474,12 +4474,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4492,12 +4492,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4511,13 +4511,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4532,13 +4532,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4549,10 +4549,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4563,10 +4563,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4577,13 +4577,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4593,10 +4593,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4607,10 +4607,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4621,10 +4621,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4637,12 +4637,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4654,11 +4654,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4670,11 +4670,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4685,11 +4685,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4701,11 +4701,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4719,13 +4719,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4736,11 +4736,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4752,11 +4752,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4770,13 +4770,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4787,11 +4787,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4803,11 +4803,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4821,13 +4821,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4839,12 +4839,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4857,12 +4857,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4875,12 +4875,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4894,13 +4894,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4912,12 +4912,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4930,12 +4930,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4948,12 +4948,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4967,13 +4967,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4988,13 +4988,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5005,10 +5005,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5019,10 +5019,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5033,13 +5033,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5049,10 +5049,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5063,10 +5063,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5077,10 +5077,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5093,12 +5093,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5110,11 +5110,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5126,11 +5126,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5141,11 +5141,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5157,11 +5157,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5175,13 +5175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5192,11 +5192,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5208,11 +5208,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5226,13 +5226,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5243,11 +5243,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5295,12 +5295,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5313,12 +5313,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5331,12 +5331,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5350,13 +5350,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5368,12 +5368,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5386,12 +5386,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5404,12 +5404,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5423,13 +5423,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5444,13 +5444,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5461,10 +5461,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5475,10 +5475,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5489,7 +5489,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -14159,7 +14159,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14169,10 +14169,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14183,10 +14183,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14197,10 +14197,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -14213,12 +14213,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -14230,11 +14230,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -14246,11 +14246,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14261,11 +14261,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14277,11 +14277,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14295,13 +14295,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14312,11 +14312,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14328,11 +14328,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14346,13 +14346,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14363,11 +14363,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14379,11 +14379,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14397,13 +14397,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14415,12 +14415,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14433,12 +14433,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14451,12 +14451,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14470,13 +14470,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14488,12 +14488,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14506,12 +14506,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14524,12 +14524,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14543,13 +14543,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14564,13 +14564,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14581,10 +14581,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14595,10 +14595,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14609,13 +14609,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14625,10 +14625,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14639,10 +14639,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14653,10 +14653,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -14669,12 +14669,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -14686,11 +14686,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -14702,11 +14702,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14717,11 +14717,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14733,11 +14733,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14751,13 +14751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14768,11 +14768,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14784,11 +14784,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14802,13 +14802,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14819,11 +14819,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14835,11 +14835,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14853,13 +14853,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14871,12 +14871,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14889,12 +14889,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14907,12 +14907,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14926,13 +14926,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14944,12 +14944,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14962,12 +14962,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14980,12 +14980,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14999,13 +14999,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15020,13 +15020,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15037,10 +15037,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15051,10 +15051,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15065,13 +15065,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15081,10 +15081,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15095,10 +15095,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15109,10 +15109,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15125,12 +15125,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -15142,11 +15142,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -15158,11 +15158,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15173,11 +15173,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15189,11 +15189,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15207,13 +15207,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15224,11 +15224,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15240,11 +15240,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15258,13 +15258,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15275,11 +15275,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15291,11 +15291,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15309,13 +15309,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15327,12 +15327,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15345,12 +15345,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15363,12 +15363,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15382,13 +15382,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15400,12 +15400,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15418,12 +15418,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15436,12 +15436,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15455,13 +15455,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15476,13 +15476,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15493,10 +15493,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15507,10 +15507,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15521,13 +15521,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15537,10 +15537,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15551,10 +15551,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15565,10 +15565,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15581,12 +15581,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -15598,11 +15598,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -15614,11 +15614,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15629,11 +15629,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15645,11 +15645,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15663,13 +15663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15680,11 +15680,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15696,11 +15696,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15714,13 +15714,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15731,11 +15731,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15747,11 +15747,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15765,13 +15765,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15783,12 +15783,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15801,12 +15801,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15819,12 +15819,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15838,13 +15838,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15856,12 +15856,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15874,12 +15874,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15892,12 +15892,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15911,13 +15911,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15932,13 +15932,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15949,10 +15949,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15963,10 +15963,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15977,13 +15977,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15993,10 +15993,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16007,10 +16007,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16021,10 +16021,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -16037,12 +16037,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16054,11 +16054,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -16070,11 +16070,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16085,11 +16085,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16101,11 +16101,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16119,13 +16119,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16136,11 +16136,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16152,11 +16152,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16170,13 +16170,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16187,11 +16187,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16203,11 +16203,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16221,13 +16221,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16239,12 +16239,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16257,12 +16257,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16275,12 +16275,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16294,13 +16294,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16312,12 +16312,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16330,12 +16330,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16348,12 +16348,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16367,13 +16367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16388,13 +16388,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16405,10 +16405,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16419,10 +16419,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16433,13 +16433,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16449,10 +16449,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16463,10 +16463,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16477,10 +16477,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -16493,12 +16493,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -16510,11 +16510,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -16526,11 +16526,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16541,11 +16541,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16557,11 +16557,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16575,13 +16575,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16592,11 +16592,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16608,11 +16608,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16626,13 +16626,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16643,11 +16643,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16659,11 +16659,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16677,13 +16677,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16695,12 +16695,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16713,12 +16713,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16731,12 +16731,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16750,13 +16750,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16768,12 +16768,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16786,12 +16786,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16804,12 +16804,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16823,13 +16823,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16844,13 +16844,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16861,10 +16861,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16875,10 +16875,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16889,13 +16889,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16905,10 +16905,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16919,10 +16919,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16933,10 +16933,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -16949,12 +16949,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -16966,11 +16966,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -16982,11 +16982,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16997,11 +16997,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17013,11 +17013,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17031,13 +17031,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17048,11 +17048,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17064,11 +17064,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17082,13 +17082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17099,11 +17099,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17115,11 +17115,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17133,13 +17133,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17151,12 +17151,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17169,12 +17169,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17187,12 +17187,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17206,13 +17206,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17224,12 +17224,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17242,12 +17242,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17260,12 +17260,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17279,13 +17279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17300,13 +17300,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17317,10 +17317,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17331,10 +17331,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17345,13 +17345,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17361,10 +17361,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17375,10 +17375,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17389,10 +17389,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -17405,12 +17405,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -17422,11 +17422,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -17438,11 +17438,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17453,11 +17453,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17469,11 +17469,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17487,13 +17487,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17504,11 +17504,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17520,11 +17520,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17538,13 +17538,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17555,11 +17555,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17571,11 +17571,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17589,13 +17589,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17607,12 +17607,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17625,12 +17625,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17643,12 +17643,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17662,13 +17662,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17680,12 +17680,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17698,12 +17698,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17716,12 +17716,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17735,13 +17735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17756,13 +17756,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17773,10 +17773,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17787,10 +17787,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17801,13 +17801,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17817,10 +17817,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17831,10 +17831,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17845,10 +17845,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17861,12 +17861,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -17878,11 +17878,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17894,11 +17894,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17909,11 +17909,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17925,11 +17925,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17943,13 +17943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17960,11 +17960,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17976,11 +17976,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17994,13 +17994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18011,11 +18011,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18027,11 +18027,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18045,13 +18045,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18063,12 +18063,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18081,12 +18081,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18099,12 +18099,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18118,13 +18118,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18136,12 +18136,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18154,12 +18154,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18172,12 +18172,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18191,13 +18191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18212,13 +18212,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18229,10 +18229,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18243,10 +18243,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18257,13 +18257,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18273,10 +18273,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18287,10 +18287,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18301,10 +18301,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18317,12 +18317,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -18334,11 +18334,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18350,11 +18350,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18365,11 +18365,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18381,11 +18381,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18399,13 +18399,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18416,11 +18416,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18432,11 +18432,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18450,13 +18450,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18467,11 +18467,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18483,11 +18483,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18501,13 +18501,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18519,12 +18519,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18537,12 +18537,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18555,12 +18555,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18574,13 +18574,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18592,12 +18592,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18610,12 +18610,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18628,12 +18628,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18647,13 +18647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18668,13 +18668,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18685,10 +18685,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18699,10 +18699,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18713,13 +18713,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18729,10 +18729,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18743,10 +18743,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18757,10 +18757,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18773,12 +18773,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -18790,11 +18790,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18806,11 +18806,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18821,11 +18821,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18837,11 +18837,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18855,13 +18855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18872,11 +18872,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18888,11 +18888,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18906,13 +18906,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18923,11 +18923,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18939,11 +18939,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18957,13 +18957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18975,12 +18975,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18993,12 +18993,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19011,12 +19011,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19030,13 +19030,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19048,12 +19048,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19066,12 +19066,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19084,12 +19084,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19103,13 +19103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19124,13 +19124,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19141,10 +19141,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19155,10 +19155,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19169,13 +19169,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19185,10 +19185,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19199,10 +19199,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19213,10 +19213,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -19229,12 +19229,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -19246,11 +19246,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -19262,11 +19262,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19277,11 +19277,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19293,11 +19293,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19311,13 +19311,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19328,11 +19328,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19344,11 +19344,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19362,13 +19362,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19379,11 +19379,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19395,11 +19395,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19413,13 +19413,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19431,12 +19431,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19449,12 +19449,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19467,12 +19467,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19486,13 +19486,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19504,12 +19504,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19522,12 +19522,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19540,12 +19540,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19559,13 +19559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19580,13 +19580,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19597,10 +19597,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19611,10 +19611,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19625,13 +19625,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19641,10 +19641,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19655,10 +19655,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19669,10 +19669,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -19685,12 +19685,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -19702,11 +19702,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -19718,11 +19718,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19733,11 +19733,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19749,11 +19749,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19767,13 +19767,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19784,11 +19784,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19800,11 +19800,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19818,13 +19818,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19835,11 +19835,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19851,11 +19851,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19869,13 +19869,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19887,12 +19887,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19905,12 +19905,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19923,12 +19923,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19942,13 +19942,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19960,12 +19960,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19978,12 +19978,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19996,12 +19996,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20015,13 +20015,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20036,13 +20036,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20053,10 +20053,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20067,10 +20067,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20081,13 +20081,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20097,10 +20097,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20111,10 +20111,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20125,10 +20125,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -20141,12 +20141,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -20158,11 +20158,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -20174,11 +20174,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20189,11 +20189,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20205,11 +20205,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20223,13 +20223,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20240,11 +20240,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20256,11 +20256,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20274,13 +20274,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20291,11 +20291,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20307,11 +20307,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20325,13 +20325,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20343,12 +20343,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20361,12 +20361,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20379,12 +20379,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20398,13 +20398,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20416,12 +20416,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20434,12 +20434,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20452,12 +20452,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20471,13 +20471,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20492,13 +20492,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20509,10 +20509,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20523,10 +20523,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20537,13 +20537,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20553,10 +20553,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20567,10 +20567,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20581,10 +20581,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -20597,12 +20597,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -20614,11 +20614,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -20630,11 +20630,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20645,11 +20645,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20661,11 +20661,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20679,13 +20679,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20696,11 +20696,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20712,11 +20712,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20730,13 +20730,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20747,11 +20747,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20763,11 +20763,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20781,13 +20781,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20799,12 +20799,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20817,12 +20817,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20835,12 +20835,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20854,13 +20854,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20872,12 +20872,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20890,12 +20890,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20908,12 +20908,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20927,13 +20927,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20948,13 +20948,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20965,10 +20965,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20979,10 +20979,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20993,13 +20993,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21009,10 +21009,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21023,10 +21023,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21037,10 +21037,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21053,12 +21053,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -21070,11 +21070,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21086,11 +21086,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21101,11 +21101,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21117,11 +21117,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21135,13 +21135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21152,11 +21152,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21168,11 +21168,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21186,13 +21186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21203,11 +21203,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21219,11 +21219,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21237,13 +21237,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21255,12 +21255,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21273,12 +21273,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21291,12 +21291,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21310,13 +21310,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21328,12 +21328,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21346,12 +21346,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21364,12 +21364,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21383,13 +21383,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21404,13 +21404,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21421,10 +21421,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21435,10 +21435,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21449,13 +21449,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21465,10 +21465,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21479,10 +21479,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21493,10 +21493,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -21509,12 +21509,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -21526,11 +21526,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -21542,11 +21542,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21557,11 +21557,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21573,11 +21573,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21591,13 +21591,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21608,11 +21608,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21624,11 +21624,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21642,13 +21642,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21659,11 +21659,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21675,11 +21675,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21693,13 +21693,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21711,12 +21711,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21729,12 +21729,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21747,12 +21747,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21766,13 +21766,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21784,12 +21784,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21802,12 +21802,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21820,12 +21820,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21839,13 +21839,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21860,13 +21860,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21877,10 +21877,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21891,10 +21891,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21905,13 +21905,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21921,10 +21921,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21935,10 +21935,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21949,10 +21949,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -21965,12 +21965,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -21982,11 +21982,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -21998,11 +21998,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22013,11 +22013,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22029,11 +22029,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22047,13 +22047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22064,11 +22064,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22080,11 +22080,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22098,13 +22098,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22115,11 +22115,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22131,11 +22131,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22149,13 +22149,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22167,12 +22167,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22185,12 +22185,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22203,12 +22203,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22222,13 +22222,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22240,12 +22240,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22258,12 +22258,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22276,12 +22276,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22295,13 +22295,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22316,13 +22316,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22333,10 +22333,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22347,10 +22347,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22361,13 +22361,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22377,10 +22377,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22391,10 +22391,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22405,10 +22405,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -22421,12 +22421,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -22438,11 +22438,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -22454,11 +22454,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22469,11 +22469,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22485,11 +22485,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22503,13 +22503,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22520,11 +22520,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22536,11 +22536,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22554,13 +22554,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22571,11 +22571,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22587,11 +22587,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22605,13 +22605,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22623,12 +22623,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22641,12 +22641,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22659,12 +22659,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22678,13 +22678,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22696,12 +22696,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22714,12 +22714,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22732,12 +22732,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22751,13 +22751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22772,13 +22772,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22789,10 +22789,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22803,10 +22803,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22817,13 +22817,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22833,10 +22833,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22847,10 +22847,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22861,10 +22861,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -22877,12 +22877,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -22894,11 +22894,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -22910,11 +22910,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22925,11 +22925,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22941,11 +22941,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -22959,13 +22959,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22976,11 +22976,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22992,11 +22992,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23010,13 +23010,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23027,11 +23027,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23043,11 +23043,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23061,13 +23061,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23079,12 +23079,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23097,12 +23097,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23115,12 +23115,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23134,13 +23134,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23152,12 +23152,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23170,12 +23170,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23188,12 +23188,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23207,13 +23207,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23228,13 +23228,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23245,10 +23245,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23259,10 +23259,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23273,13 +23273,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23289,10 +23289,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23303,10 +23303,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23317,10 +23317,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -23333,12 +23333,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -23350,11 +23350,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -23366,11 +23366,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23381,11 +23381,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23397,11 +23397,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23415,13 +23415,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -23432,11 +23432,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -23448,11 +23448,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23466,13 +23466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23483,11 +23483,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23499,11 +23499,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23517,13 +23517,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23535,12 +23535,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23553,12 +23553,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23571,12 +23571,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23590,13 +23590,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23608,12 +23608,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23626,12 +23626,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23644,12 +23644,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23663,13 +23663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23684,13 +23684,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23701,10 +23701,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23715,10 +23715,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23729,13 +23729,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -23745,10 +23745,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -23759,10 +23759,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -23773,10 +23773,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -23789,12 +23789,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -23806,11 +23806,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -23822,11 +23822,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23837,11 +23837,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23853,11 +23853,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23871,13 +23871,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -23888,11 +23888,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -23904,11 +23904,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23922,13 +23922,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23939,11 +23939,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23955,11 +23955,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23973,13 +23973,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23991,12 +23991,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24009,12 +24009,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24027,12 +24027,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24046,13 +24046,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24064,12 +24064,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24082,12 +24082,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24100,12 +24100,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24119,13 +24119,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24140,13 +24140,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24157,10 +24157,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24171,10 +24171,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24185,13 +24185,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24201,10 +24201,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24215,10 +24215,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24229,10 +24229,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -24245,12 +24245,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -24262,11 +24262,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -24278,11 +24278,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24293,11 +24293,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24309,11 +24309,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24327,13 +24327,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -24344,11 +24344,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -24360,11 +24360,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24378,13 +24378,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -24395,11 +24395,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -24411,11 +24411,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24429,13 +24429,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24447,12 +24447,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24465,12 +24465,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24483,12 +24483,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24502,13 +24502,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24520,12 +24520,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24538,12 +24538,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24556,12 +24556,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24575,13 +24575,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24596,13 +24596,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24613,10 +24613,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24627,10 +24627,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24641,13 +24641,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -24657,10 +24657,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -24671,10 +24671,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -24685,10 +24685,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -24701,12 +24701,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -24718,11 +24718,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -24734,11 +24734,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24749,11 +24749,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24765,11 +24765,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24783,13 +24783,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -24800,11 +24800,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -24816,11 +24816,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24834,13 +24834,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -24851,11 +24851,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -24867,11 +24867,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24885,13 +24885,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24903,12 +24903,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24921,12 +24921,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24939,12 +24939,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24958,13 +24958,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24976,12 +24976,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -24994,12 +24994,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25012,12 +25012,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25031,13 +25031,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25052,13 +25052,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25069,10 +25069,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25083,10 +25083,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25097,7 +25097,7 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qs8-gemm-minmax-gemmlowp.yaml b/test/qs8-gemm-minmax-gemmlowp.yaml
index 510db1b..9d6d03e 100644
--- a/test/qs8-gemm-minmax-gemmlowp.yaml
+++ b/test/qs8-gemm-minmax-gemmlowp.yaml
@@ -3,40 +3,40 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -96,76 +96,76 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mull_addw_dup
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c4__neondot
diff --git a/test/qs8-gemm-minmax-rndnu.cc b/test/qs8-gemm-minmax-rndnu.cc
index 9c24f56..0c75164 100644
--- a/test/qs8-gemm-minmax-rndnu.cc
+++ b/test/qs8-gemm-minmax-rndnu.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -61,10 +61,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -77,12 +77,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -94,11 +94,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -110,11 +110,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -125,11 +125,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -141,11 +141,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -159,13 +159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -176,11 +176,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -192,11 +192,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -210,13 +210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -227,11 +227,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -243,11 +243,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -261,13 +261,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -279,12 +279,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -297,12 +297,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -315,12 +315,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -334,13 +334,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -352,12 +352,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -370,12 +370,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -388,12 +388,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -407,13 +407,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -428,13 +428,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -445,10 +445,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -459,10 +459,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -473,13 +473,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -489,10 +489,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -503,10 +503,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -517,10 +517,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -533,12 +533,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -550,11 +550,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -566,11 +566,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -581,11 +581,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -597,11 +597,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -615,13 +615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -632,11 +632,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -648,11 +648,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -666,13 +666,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -683,11 +683,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -699,11 +699,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -717,13 +717,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -735,12 +735,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -753,12 +753,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -771,12 +771,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -790,13 +790,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -808,12 +808,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -826,12 +826,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -844,12 +844,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -863,13 +863,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -884,13 +884,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -901,10 +901,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -915,10 +915,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -929,13 +929,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -945,10 +945,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -959,10 +959,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -973,10 +973,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -989,12 +989,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -1006,11 +1006,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1022,11 +1022,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1037,11 +1037,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1053,11 +1053,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1071,13 +1071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1088,11 +1088,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1104,11 +1104,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1122,13 +1122,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1139,11 +1139,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1155,11 +1155,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1173,13 +1173,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1191,12 +1191,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1209,12 +1209,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1227,12 +1227,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1246,13 +1246,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1264,12 +1264,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1282,12 +1282,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1300,12 +1300,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1319,13 +1319,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1340,13 +1340,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -1357,10 +1357,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -1371,10 +1371,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -1385,13 +1385,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1401,10 +1401,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1415,10 +1415,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1429,10 +1429,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1445,12 +1445,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -1462,11 +1462,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1478,11 +1478,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1493,11 +1493,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1509,11 +1509,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1527,13 +1527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1544,11 +1544,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1560,11 +1560,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1578,13 +1578,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1595,11 +1595,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1611,11 +1611,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1629,13 +1629,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1647,12 +1647,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1665,12 +1665,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1683,12 +1683,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1702,13 +1702,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1720,12 +1720,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1738,12 +1738,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1756,12 +1756,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1775,13 +1775,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1796,13 +1796,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1813,10 +1813,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1827,10 +1827,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1841,13 +1841,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1857,10 +1857,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1871,10 +1871,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1885,10 +1885,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -1901,12 +1901,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1918,11 +1918,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -1934,11 +1934,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1949,11 +1949,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1965,11 +1965,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1983,13 +1983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2000,11 +2000,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2016,11 +2016,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2034,13 +2034,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2051,11 +2051,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2067,11 +2067,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2085,13 +2085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2103,12 +2103,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2121,12 +2121,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2139,12 +2139,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2158,13 +2158,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2176,12 +2176,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2194,12 +2194,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2212,12 +2212,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2231,13 +2231,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2252,13 +2252,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2269,10 +2269,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2283,10 +2283,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2297,13 +2297,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2313,10 +2313,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2327,10 +2327,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2341,10 +2341,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -2357,12 +2357,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2374,11 +2374,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -2390,11 +2390,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2405,11 +2405,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2421,11 +2421,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2439,13 +2439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2456,11 +2456,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2472,11 +2472,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2490,13 +2490,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2507,11 +2507,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2523,11 +2523,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2541,13 +2541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2559,12 +2559,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2577,12 +2577,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2595,12 +2595,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2614,13 +2614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2632,12 +2632,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2650,12 +2650,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2668,12 +2668,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2687,13 +2687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2708,13 +2708,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2725,10 +2725,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2739,10 +2739,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2753,13 +2753,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -2769,10 +2769,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -2783,10 +2783,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -2797,10 +2797,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -2813,12 +2813,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -2830,11 +2830,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -2846,11 +2846,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2861,11 +2861,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2877,11 +2877,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2895,13 +2895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2912,11 +2912,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2928,11 +2928,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2946,13 +2946,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2963,11 +2963,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2979,11 +2979,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2997,13 +2997,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3015,12 +3015,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3033,12 +3033,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3051,12 +3051,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3070,13 +3070,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3088,12 +3088,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3106,12 +3106,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3124,12 +3124,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3143,13 +3143,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -3164,13 +3164,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -3181,10 +3181,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -3195,10 +3195,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -3209,13 +3209,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3225,10 +3225,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3239,10 +3239,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3253,10 +3253,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -3269,12 +3269,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -3286,11 +3286,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -3302,11 +3302,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -3317,11 +3317,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -3333,11 +3333,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3351,13 +3351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3368,11 +3368,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3384,11 +3384,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3402,13 +3402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -3419,11 +3419,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -3435,11 +3435,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3453,13 +3453,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3471,12 +3471,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3489,12 +3489,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3507,12 +3507,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3526,13 +3526,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3544,12 +3544,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3562,12 +3562,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3580,12 +3580,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3599,13 +3599,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3620,13 +3620,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3637,10 +3637,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3651,10 +3651,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3665,13 +3665,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3681,10 +3681,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3695,10 +3695,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3709,10 +3709,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3725,12 +3725,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3742,11 +3742,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3758,11 +3758,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3773,11 +3773,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3789,11 +3789,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3807,13 +3807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3858,13 +3858,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3875,11 +3875,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3891,11 +3891,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3909,13 +3909,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3927,12 +3927,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3945,12 +3945,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3963,12 +3963,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3982,13 +3982,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4000,12 +4000,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4018,12 +4018,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4036,12 +4036,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4055,13 +4055,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4076,13 +4076,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4093,10 +4093,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4107,10 +4107,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4121,13 +4121,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4137,10 +4137,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4151,10 +4151,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4165,10 +4165,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4181,12 +4181,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4198,11 +4198,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4214,11 +4214,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4229,11 +4229,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4245,11 +4245,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4263,13 +4263,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4280,11 +4280,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4296,11 +4296,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4314,13 +4314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4331,11 +4331,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4347,11 +4347,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4365,13 +4365,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4383,12 +4383,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4401,12 +4401,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4419,12 +4419,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4438,13 +4438,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4456,12 +4456,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4474,12 +4474,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4492,12 +4492,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4511,13 +4511,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4532,13 +4532,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4549,10 +4549,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4563,10 +4563,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4577,13 +4577,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -4593,10 +4593,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -4607,10 +4607,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -4621,10 +4621,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4637,12 +4637,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -4654,11 +4654,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4670,11 +4670,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4685,11 +4685,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4701,11 +4701,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4719,13 +4719,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4736,11 +4736,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4752,11 +4752,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4770,13 +4770,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4787,11 +4787,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4803,11 +4803,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4821,13 +4821,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4839,12 +4839,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4857,12 +4857,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4875,12 +4875,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4894,13 +4894,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4912,12 +4912,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4930,12 +4930,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4948,12 +4948,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4967,13 +4967,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4988,13 +4988,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -5005,10 +5005,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -5019,10 +5019,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -5033,13 +5033,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5049,10 +5049,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5063,10 +5063,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5077,10 +5077,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5093,12 +5093,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -5110,11 +5110,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5126,11 +5126,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5141,11 +5141,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5157,11 +5157,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5175,13 +5175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5192,11 +5192,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5208,11 +5208,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5226,13 +5226,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5243,11 +5243,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5295,12 +5295,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5313,12 +5313,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5331,12 +5331,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5350,13 +5350,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5368,12 +5368,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5386,12 +5386,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5404,12 +5404,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5423,13 +5423,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5444,13 +5444,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5461,10 +5461,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5475,10 +5475,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5489,13 +5489,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5505,10 +5505,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5519,10 +5519,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5533,10 +5533,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -5549,12 +5549,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5566,11 +5566,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -5582,11 +5582,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5597,11 +5597,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5613,11 +5613,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5631,13 +5631,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5648,11 +5648,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5664,11 +5664,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5682,13 +5682,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5699,11 +5699,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5715,11 +5715,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5733,13 +5733,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5751,12 +5751,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5769,12 +5769,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5787,12 +5787,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5806,13 +5806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5824,12 +5824,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5842,12 +5842,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5860,12 +5860,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5879,13 +5879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5900,13 +5900,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5917,10 +5917,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5931,10 +5931,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5945,13 +5945,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5961,10 +5961,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5975,10 +5975,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5989,10 +5989,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -6005,12 +6005,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6022,11 +6022,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -6038,11 +6038,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6053,11 +6053,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6069,11 +6069,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6087,13 +6087,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6104,11 +6104,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6120,11 +6120,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6138,13 +6138,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6155,11 +6155,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6171,11 +6171,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6189,13 +6189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6207,12 +6207,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6225,12 +6225,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6243,12 +6243,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6262,13 +6262,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6280,12 +6280,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6298,12 +6298,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6316,12 +6316,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6335,13 +6335,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6356,13 +6356,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6373,10 +6373,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6387,10 +6387,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6401,13 +6401,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6417,10 +6417,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6431,10 +6431,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6445,10 +6445,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -6461,12 +6461,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -6478,11 +6478,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -6494,11 +6494,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6509,11 +6509,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6525,11 +6525,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6543,13 +6543,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6560,11 +6560,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6576,11 +6576,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6594,13 +6594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6611,11 +6611,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6627,11 +6627,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6645,13 +6645,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6663,12 +6663,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6681,12 +6681,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6699,12 +6699,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6718,13 +6718,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6736,12 +6736,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6754,12 +6754,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6772,12 +6772,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6791,13 +6791,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6812,13 +6812,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6829,10 +6829,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6843,10 +6843,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6857,13 +6857,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6873,10 +6873,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6887,10 +6887,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6901,10 +6901,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -6917,12 +6917,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -6934,11 +6934,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -6950,11 +6950,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6965,11 +6965,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6981,11 +6981,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6999,13 +6999,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7016,11 +7016,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7032,11 +7032,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7050,13 +7050,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7067,11 +7067,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7083,11 +7083,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7101,13 +7101,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7119,12 +7119,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7137,12 +7137,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7155,12 +7155,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7174,13 +7174,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7192,12 +7192,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7210,12 +7210,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7228,12 +7228,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7247,13 +7247,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7268,13 +7268,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7285,10 +7285,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7299,10 +7299,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7313,13 +7313,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7329,10 +7329,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7343,10 +7343,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7357,10 +7357,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7373,12 +7373,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -7390,11 +7390,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7406,11 +7406,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7421,11 +7421,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7437,11 +7437,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7455,13 +7455,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7472,11 +7472,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7488,11 +7488,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7506,13 +7506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7523,11 +7523,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7539,11 +7539,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7557,13 +7557,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7575,12 +7575,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7593,12 +7593,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7611,12 +7611,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7630,13 +7630,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7648,12 +7648,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7666,12 +7666,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7684,12 +7684,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7703,13 +7703,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7724,13 +7724,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7741,10 +7741,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7755,10 +7755,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7769,13 +7769,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7785,10 +7785,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7799,10 +7799,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7813,10 +7813,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7829,12 +7829,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7846,11 +7846,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7862,11 +7862,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7877,11 +7877,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7893,11 +7893,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7911,13 +7911,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7928,11 +7928,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7944,11 +7944,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7962,13 +7962,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7979,11 +7979,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7995,11 +7995,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8013,13 +8013,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8031,12 +8031,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8049,12 +8049,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8067,12 +8067,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8086,13 +8086,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8104,12 +8104,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8122,12 +8122,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8140,12 +8140,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8159,13 +8159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8180,13 +8180,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8197,10 +8197,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8211,10 +8211,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8225,13 +8225,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8241,10 +8241,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8255,10 +8255,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8269,10 +8269,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8285,12 +8285,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -8302,11 +8302,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8318,11 +8318,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8333,11 +8333,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8349,11 +8349,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8367,13 +8367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8384,11 +8384,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8400,11 +8400,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8418,13 +8418,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8435,11 +8435,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8451,11 +8451,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8469,13 +8469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8487,12 +8487,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8505,12 +8505,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8523,12 +8523,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8542,13 +8542,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8560,12 +8560,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8578,12 +8578,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8596,12 +8596,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8615,13 +8615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8636,13 +8636,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8653,10 +8653,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8667,10 +8667,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8681,13 +8681,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8697,10 +8697,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8711,10 +8711,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8725,10 +8725,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8741,12 +8741,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -8758,11 +8758,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8774,11 +8774,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8789,11 +8789,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8805,11 +8805,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8823,13 +8823,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8840,11 +8840,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8856,11 +8856,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8874,13 +8874,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8891,11 +8891,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8907,11 +8907,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8925,13 +8925,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8943,12 +8943,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8961,12 +8961,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8979,12 +8979,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8998,13 +8998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9016,12 +9016,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9034,12 +9034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9052,12 +9052,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9071,13 +9071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9092,13 +9092,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9109,10 +9109,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9123,10 +9123,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9137,13 +9137,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9153,10 +9153,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9167,10 +9167,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9181,10 +9181,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -9197,12 +9197,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -9214,11 +9214,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -9230,11 +9230,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9245,11 +9245,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9261,11 +9261,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9279,13 +9279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9296,11 +9296,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9312,11 +9312,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9330,13 +9330,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9347,11 +9347,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9363,11 +9363,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9381,13 +9381,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9399,12 +9399,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9417,12 +9417,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9435,12 +9435,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9454,13 +9454,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9472,12 +9472,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9490,12 +9490,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9508,12 +9508,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9527,13 +9527,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9548,13 +9548,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9565,10 +9565,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9579,10 +9579,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9593,13 +9593,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9609,10 +9609,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9623,10 +9623,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9637,10 +9637,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -9653,12 +9653,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9670,11 +9670,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -9686,11 +9686,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9701,11 +9701,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9717,11 +9717,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9735,13 +9735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9752,11 +9752,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9768,11 +9768,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9786,13 +9786,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9803,11 +9803,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9819,11 +9819,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9837,13 +9837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9855,12 +9855,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9873,12 +9873,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9891,12 +9891,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9910,13 +9910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9928,12 +9928,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9946,12 +9946,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9964,12 +9964,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9983,13 +9983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10004,13 +10004,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10021,10 +10021,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10035,10 +10035,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10049,13 +10049,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10065,10 +10065,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10079,10 +10079,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10093,10 +10093,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -10109,12 +10109,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -10126,11 +10126,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -10142,11 +10142,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10157,11 +10157,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10173,11 +10173,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10191,13 +10191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10208,11 +10208,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10224,11 +10224,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10242,13 +10242,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10259,11 +10259,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10275,11 +10275,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10293,13 +10293,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10311,12 +10311,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10329,12 +10329,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10347,12 +10347,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10366,13 +10366,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10384,12 +10384,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10402,12 +10402,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10420,12 +10420,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10439,13 +10439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10460,13 +10460,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10477,10 +10477,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10491,10 +10491,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10505,13 +10505,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10521,10 +10521,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10535,10 +10535,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10549,10 +10549,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -10565,12 +10565,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -10582,11 +10582,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -10598,11 +10598,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10613,11 +10613,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10629,11 +10629,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10647,13 +10647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10664,11 +10664,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10680,11 +10680,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10698,13 +10698,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10715,11 +10715,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10731,11 +10731,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10749,13 +10749,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10767,12 +10767,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10785,12 +10785,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10803,12 +10803,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10822,13 +10822,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10840,12 +10840,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10858,12 +10858,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10876,12 +10876,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10895,13 +10895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10916,13 +10916,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10933,10 +10933,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10947,10 +10947,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10961,13 +10961,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10977,10 +10977,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10991,10 +10991,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11005,10 +11005,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11021,12 +11021,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11038,11 +11038,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11054,11 +11054,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11069,11 +11069,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11085,11 +11085,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11103,13 +11103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11120,11 +11120,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11136,11 +11136,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11154,13 +11154,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11171,11 +11171,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11187,11 +11187,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11205,13 +11205,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11223,12 +11223,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11241,12 +11241,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11259,12 +11259,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11278,13 +11278,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11296,12 +11296,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11314,12 +11314,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11332,12 +11332,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11351,13 +11351,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11372,13 +11372,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11389,10 +11389,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11403,10 +11403,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11417,13 +11417,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11433,10 +11433,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11447,10 +11447,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11461,10 +11461,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11477,12 +11477,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -11494,11 +11494,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11510,11 +11510,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11525,11 +11525,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11541,11 +11541,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11559,13 +11559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11576,11 +11576,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11592,11 +11592,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11610,13 +11610,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11627,11 +11627,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11643,11 +11643,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11661,13 +11661,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11679,12 +11679,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11697,12 +11697,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11715,12 +11715,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11734,13 +11734,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11752,12 +11752,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11770,12 +11770,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11788,12 +11788,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11807,13 +11807,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11828,13 +11828,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11845,10 +11845,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11859,10 +11859,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11873,13 +11873,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -11889,10 +11889,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -11903,10 +11903,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -11917,10 +11917,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11933,12 +11933,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -11950,11 +11950,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11966,11 +11966,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11981,11 +11981,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11997,11 +11997,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12015,13 +12015,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12032,11 +12032,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12048,11 +12048,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12066,13 +12066,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12083,11 +12083,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12099,11 +12099,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12117,13 +12117,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12135,12 +12135,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12153,12 +12153,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12171,12 +12171,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12190,13 +12190,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12208,12 +12208,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12226,12 +12226,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12244,12 +12244,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12263,13 +12263,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12284,13 +12284,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12301,10 +12301,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12315,10 +12315,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12329,13 +12329,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12345,10 +12345,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12359,10 +12359,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12373,10 +12373,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12389,12 +12389,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -12406,11 +12406,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12422,11 +12422,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12437,11 +12437,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12453,11 +12453,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12471,13 +12471,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12488,11 +12488,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12504,11 +12504,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12522,13 +12522,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12539,11 +12539,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12555,11 +12555,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12573,13 +12573,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12591,12 +12591,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12609,12 +12609,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12627,12 +12627,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12646,13 +12646,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12664,12 +12664,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12682,12 +12682,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12700,12 +12700,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12719,13 +12719,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12740,13 +12740,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12757,10 +12757,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12771,10 +12771,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12785,13 +12785,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12801,10 +12801,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12815,10 +12815,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12829,10 +12829,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -12845,12 +12845,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -12862,11 +12862,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -12878,11 +12878,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12893,11 +12893,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12909,11 +12909,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12927,13 +12927,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12944,11 +12944,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12960,11 +12960,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12978,13 +12978,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12995,11 +12995,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13011,11 +13011,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13029,13 +13029,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13047,12 +13047,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13065,12 +13065,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13083,12 +13083,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13102,13 +13102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13120,12 +13120,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13138,12 +13138,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13156,12 +13156,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13175,13 +13175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13196,13 +13196,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13213,10 +13213,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13227,10 +13227,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13241,13 +13241,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13257,10 +13257,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13271,10 +13271,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13285,10 +13285,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -13301,12 +13301,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -13318,11 +13318,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -13334,11 +13334,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13349,11 +13349,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13365,11 +13365,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13383,13 +13383,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -13400,11 +13400,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -13416,11 +13416,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13434,13 +13434,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13451,11 +13451,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13467,11 +13467,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13485,13 +13485,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13503,12 +13503,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13521,12 +13521,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13539,12 +13539,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13558,13 +13558,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13576,12 +13576,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13594,12 +13594,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13612,12 +13612,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13631,13 +13631,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13652,13 +13652,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13669,10 +13669,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13683,10 +13683,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13697,13 +13697,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13713,10 +13713,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13727,10 +13727,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13741,10 +13741,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -13757,12 +13757,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -13774,11 +13774,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -13790,11 +13790,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13805,11 +13805,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13821,11 +13821,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13839,13 +13839,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -13856,11 +13856,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -13872,11 +13872,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13890,13 +13890,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13907,11 +13907,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13923,11 +13923,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13941,13 +13941,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13959,12 +13959,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13977,12 +13977,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13995,12 +13995,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14014,13 +14014,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14032,12 +14032,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14050,12 +14050,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14068,12 +14068,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14087,13 +14087,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14108,13 +14108,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14125,10 +14125,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14139,10 +14139,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14153,13 +14153,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14169,10 +14169,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14183,10 +14183,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14197,10 +14197,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -14213,12 +14213,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -14230,11 +14230,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -14246,11 +14246,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14261,11 +14261,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14277,11 +14277,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14295,13 +14295,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -14312,11 +14312,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -14328,11 +14328,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14346,13 +14346,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -14363,11 +14363,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -14379,11 +14379,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14397,13 +14397,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14415,12 +14415,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14433,12 +14433,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14451,12 +14451,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14470,13 +14470,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14488,12 +14488,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14506,12 +14506,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14524,12 +14524,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14543,13 +14543,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14564,13 +14564,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14581,10 +14581,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14595,10 +14595,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14609,13 +14609,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14625,10 +14625,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14639,10 +14639,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14653,10 +14653,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -14669,12 +14669,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -14686,11 +14686,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -14702,11 +14702,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14717,11 +14717,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14733,11 +14733,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14751,13 +14751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14768,11 +14768,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14784,11 +14784,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14802,13 +14802,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14819,11 +14819,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14835,11 +14835,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14853,13 +14853,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14871,12 +14871,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14889,12 +14889,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14907,12 +14907,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14926,13 +14926,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14944,12 +14944,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14962,12 +14962,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14980,12 +14980,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14999,13 +14999,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15020,13 +15020,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15037,10 +15037,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15051,10 +15051,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15065,13 +15065,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15081,10 +15081,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15095,10 +15095,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15109,10 +15109,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15125,12 +15125,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -15142,11 +15142,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -15158,11 +15158,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15173,11 +15173,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15189,11 +15189,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15207,13 +15207,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15224,11 +15224,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15240,11 +15240,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15258,13 +15258,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15275,11 +15275,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15291,11 +15291,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15309,13 +15309,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15327,12 +15327,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15345,12 +15345,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15363,12 +15363,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15382,13 +15382,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15400,12 +15400,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15418,12 +15418,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15436,12 +15436,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15455,13 +15455,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15476,13 +15476,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15493,10 +15493,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15507,10 +15507,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15521,13 +15521,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15537,10 +15537,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15551,10 +15551,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15565,10 +15565,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15581,12 +15581,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -15598,11 +15598,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -15614,11 +15614,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15629,11 +15629,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15645,11 +15645,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15663,13 +15663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15680,11 +15680,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15696,11 +15696,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15714,13 +15714,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15731,11 +15731,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15747,11 +15747,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15765,13 +15765,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15783,12 +15783,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15801,12 +15801,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15819,12 +15819,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15838,13 +15838,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15856,12 +15856,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15874,12 +15874,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15892,12 +15892,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15911,13 +15911,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15932,13 +15932,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15949,10 +15949,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15963,10 +15963,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15977,13 +15977,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15993,10 +15993,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16007,10 +16007,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16021,10 +16021,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16037,12 +16037,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -16054,11 +16054,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16070,11 +16070,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16085,11 +16085,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16101,11 +16101,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16119,13 +16119,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16136,11 +16136,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16152,11 +16152,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16170,13 +16170,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16187,11 +16187,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16203,11 +16203,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16221,13 +16221,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16239,12 +16239,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16257,12 +16257,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16275,12 +16275,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16294,13 +16294,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16312,12 +16312,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16330,12 +16330,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16348,12 +16348,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16367,13 +16367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16388,13 +16388,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16405,10 +16405,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16419,10 +16419,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16433,13 +16433,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16449,10 +16449,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16463,10 +16463,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16477,10 +16477,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -16493,12 +16493,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16510,11 +16510,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -16526,11 +16526,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16541,11 +16541,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16557,11 +16557,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16575,13 +16575,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16592,11 +16592,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16608,11 +16608,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16626,13 +16626,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16643,11 +16643,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16659,11 +16659,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16677,13 +16677,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16695,12 +16695,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16713,12 +16713,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16731,12 +16731,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16750,13 +16750,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16768,12 +16768,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16786,12 +16786,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16804,12 +16804,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16823,13 +16823,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16844,13 +16844,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16861,10 +16861,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16875,10 +16875,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16889,13 +16889,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16905,10 +16905,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16919,10 +16919,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16933,10 +16933,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -16949,12 +16949,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -16966,11 +16966,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -16982,11 +16982,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16997,11 +16997,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17013,11 +17013,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17031,13 +17031,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17048,11 +17048,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17064,11 +17064,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17082,13 +17082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17099,11 +17099,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17115,11 +17115,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17133,13 +17133,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17151,12 +17151,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17169,12 +17169,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17187,12 +17187,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17206,13 +17206,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17224,12 +17224,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17242,12 +17242,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17260,12 +17260,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17279,13 +17279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17300,13 +17300,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17317,10 +17317,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17331,10 +17331,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17345,13 +17345,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17361,10 +17361,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17375,10 +17375,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17389,10 +17389,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -17405,12 +17405,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -17422,11 +17422,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -17438,11 +17438,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17453,11 +17453,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17469,11 +17469,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17487,13 +17487,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17504,11 +17504,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17520,11 +17520,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17538,13 +17538,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17555,11 +17555,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17571,11 +17571,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17589,13 +17589,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17607,12 +17607,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17625,12 +17625,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17643,12 +17643,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17662,13 +17662,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17680,12 +17680,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17698,12 +17698,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17716,12 +17716,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17735,13 +17735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17756,13 +17756,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17773,10 +17773,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17787,10 +17787,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17801,13 +17801,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17817,10 +17817,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17831,10 +17831,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17845,10 +17845,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -17861,12 +17861,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -17878,11 +17878,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -17894,11 +17894,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17909,11 +17909,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17925,11 +17925,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17943,13 +17943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17960,11 +17960,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17976,11 +17976,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17994,13 +17994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -18011,11 +18011,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -18027,11 +18027,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18045,13 +18045,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18063,12 +18063,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18081,12 +18081,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18099,12 +18099,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18118,13 +18118,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18136,12 +18136,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18154,12 +18154,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18172,12 +18172,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18191,13 +18191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18212,13 +18212,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18229,10 +18229,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18243,10 +18243,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18257,13 +18257,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18273,10 +18273,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18287,10 +18287,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18301,10 +18301,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18317,12 +18317,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -18334,11 +18334,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18350,11 +18350,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18365,11 +18365,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18381,11 +18381,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18399,13 +18399,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18416,11 +18416,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18432,11 +18432,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18450,13 +18450,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18467,11 +18467,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18483,11 +18483,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18501,13 +18501,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18519,12 +18519,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18537,12 +18537,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18555,12 +18555,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18574,13 +18574,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18592,12 +18592,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18610,12 +18610,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18628,12 +18628,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18647,13 +18647,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18668,13 +18668,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18685,10 +18685,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18699,10 +18699,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18713,13 +18713,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18729,10 +18729,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18743,10 +18743,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18757,10 +18757,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18773,12 +18773,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -18790,11 +18790,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18806,11 +18806,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18821,11 +18821,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18837,11 +18837,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18855,13 +18855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18872,11 +18872,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18888,11 +18888,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18906,13 +18906,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18923,11 +18923,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18939,11 +18939,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18957,13 +18957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18975,12 +18975,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18993,12 +18993,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19011,12 +19011,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19030,13 +19030,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19048,12 +19048,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19066,12 +19066,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19084,12 +19084,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19103,13 +19103,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19124,13 +19124,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19141,10 +19141,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19155,10 +19155,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19169,13 +19169,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19185,10 +19185,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19199,10 +19199,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19213,10 +19213,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -19229,12 +19229,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -19246,11 +19246,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -19262,11 +19262,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19277,11 +19277,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19293,11 +19293,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19311,13 +19311,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19328,11 +19328,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19344,11 +19344,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19362,13 +19362,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19379,11 +19379,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19395,11 +19395,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19413,13 +19413,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19431,12 +19431,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19449,12 +19449,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19467,12 +19467,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19486,13 +19486,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19504,12 +19504,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19522,12 +19522,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19540,12 +19540,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19559,13 +19559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19580,13 +19580,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19597,10 +19597,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19611,10 +19611,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19625,13 +19625,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19641,10 +19641,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19655,10 +19655,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19669,10 +19669,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -19685,12 +19685,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -19702,11 +19702,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -19718,11 +19718,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19733,11 +19733,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19749,11 +19749,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19767,13 +19767,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19784,11 +19784,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19800,11 +19800,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19818,13 +19818,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19835,11 +19835,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19851,11 +19851,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19869,13 +19869,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19887,12 +19887,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19905,12 +19905,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19923,12 +19923,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19942,13 +19942,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19960,12 +19960,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19978,12 +19978,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19996,12 +19996,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20015,13 +20015,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -20036,13 +20036,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20053,10 +20053,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20067,10 +20067,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20081,13 +20081,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20097,10 +20097,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20111,10 +20111,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20125,10 +20125,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -20141,12 +20141,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -20158,11 +20158,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -20174,11 +20174,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20189,11 +20189,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20205,11 +20205,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20223,13 +20223,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20240,11 +20240,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20256,11 +20256,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20274,13 +20274,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20291,11 +20291,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20307,11 +20307,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20325,13 +20325,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20343,12 +20343,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20361,12 +20361,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20379,12 +20379,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20398,13 +20398,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20416,12 +20416,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20434,12 +20434,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20452,12 +20452,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20471,13 +20471,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20492,13 +20492,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20509,10 +20509,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20523,10 +20523,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20537,13 +20537,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20553,10 +20553,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20567,10 +20567,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20581,10 +20581,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -20597,12 +20597,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -20614,11 +20614,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -20630,11 +20630,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20645,11 +20645,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20661,11 +20661,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20679,13 +20679,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20696,11 +20696,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20712,11 +20712,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20730,13 +20730,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20747,11 +20747,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20763,11 +20763,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20781,13 +20781,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20799,12 +20799,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20817,12 +20817,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20835,12 +20835,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20854,13 +20854,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20872,12 +20872,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20890,12 +20890,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20908,12 +20908,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20927,13 +20927,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20948,13 +20948,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20965,10 +20965,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20979,10 +20979,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20993,13 +20993,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21009,10 +21009,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21023,10 +21023,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21037,10 +21037,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21053,12 +21053,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -21070,11 +21070,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21086,11 +21086,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21101,11 +21101,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21117,11 +21117,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21135,13 +21135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21152,11 +21152,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21168,11 +21168,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21186,13 +21186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21203,11 +21203,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21219,11 +21219,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21237,13 +21237,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21255,12 +21255,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21273,12 +21273,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21291,12 +21291,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21310,13 +21310,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21328,12 +21328,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21346,12 +21346,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21364,12 +21364,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21383,13 +21383,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21404,13 +21404,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21421,10 +21421,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21435,10 +21435,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21449,13 +21449,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21465,10 +21465,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21479,10 +21479,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21493,10 +21493,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21509,12 +21509,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -21526,11 +21526,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21542,11 +21542,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21557,11 +21557,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21573,11 +21573,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21591,13 +21591,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21608,11 +21608,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21624,11 +21624,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21642,13 +21642,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21659,11 +21659,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21675,11 +21675,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21693,13 +21693,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21711,12 +21711,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21729,12 +21729,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21747,12 +21747,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21766,13 +21766,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21784,12 +21784,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21802,12 +21802,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21820,12 +21820,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21839,13 +21839,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21860,13 +21860,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21877,10 +21877,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21891,10 +21891,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21905,13 +21905,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21921,10 +21921,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21935,10 +21935,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21949,10 +21949,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -21965,12 +21965,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -21982,11 +21982,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -21998,11 +21998,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22013,11 +22013,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22029,11 +22029,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22047,13 +22047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22064,11 +22064,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22080,11 +22080,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22098,13 +22098,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22115,11 +22115,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22131,11 +22131,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22149,13 +22149,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22167,12 +22167,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22185,12 +22185,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22203,12 +22203,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22222,13 +22222,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22240,12 +22240,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22258,12 +22258,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22276,12 +22276,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22295,13 +22295,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22316,13 +22316,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22333,10 +22333,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22347,10 +22347,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22361,13 +22361,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22377,10 +22377,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22391,10 +22391,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22405,10 +22405,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -22421,12 +22421,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -22438,11 +22438,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -22454,11 +22454,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22469,11 +22469,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22485,11 +22485,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22503,13 +22503,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22520,11 +22520,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22536,11 +22536,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22554,13 +22554,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22571,11 +22571,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22587,11 +22587,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22605,13 +22605,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22623,12 +22623,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22641,12 +22641,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22659,12 +22659,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22678,13 +22678,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22696,12 +22696,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22714,12 +22714,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22732,12 +22732,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22751,13 +22751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22772,13 +22772,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22789,10 +22789,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22803,10 +22803,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22817,13 +22817,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22833,10 +22833,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22847,10 +22847,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22861,10 +22861,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -22877,12 +22877,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -22894,11 +22894,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -22910,11 +22910,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22925,11 +22925,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22941,11 +22941,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22959,13 +22959,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22976,11 +22976,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22992,11 +22992,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23010,13 +23010,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23027,11 +23027,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23043,11 +23043,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23061,13 +23061,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23079,12 +23079,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23097,12 +23097,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23115,12 +23115,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23134,13 +23134,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23152,12 +23152,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23170,12 +23170,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23188,12 +23188,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23207,13 +23207,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23228,13 +23228,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23245,10 +23245,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23259,10 +23259,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23273,13 +23273,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23289,10 +23289,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23303,10 +23303,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23317,10 +23317,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -23333,12 +23333,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -23350,11 +23350,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -23366,11 +23366,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23381,11 +23381,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23397,11 +23397,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23415,13 +23415,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23432,11 +23432,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23448,11 +23448,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23466,13 +23466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23483,11 +23483,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23499,11 +23499,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23517,13 +23517,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23535,12 +23535,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23553,12 +23553,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23571,12 +23571,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23590,13 +23590,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23608,12 +23608,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23626,12 +23626,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23644,12 +23644,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23663,13 +23663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23684,13 +23684,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23701,10 +23701,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23715,10 +23715,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23729,13 +23729,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23745,10 +23745,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23759,10 +23759,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -23773,10 +23773,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -23789,12 +23789,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -23806,11 +23806,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -23822,11 +23822,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23837,11 +23837,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23853,11 +23853,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23871,13 +23871,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23888,11 +23888,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23904,11 +23904,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23922,13 +23922,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23939,11 +23939,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23955,11 +23955,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23973,13 +23973,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23991,12 +23991,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24009,12 +24009,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24027,12 +24027,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24046,13 +24046,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24064,12 +24064,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24082,12 +24082,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24100,12 +24100,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24119,13 +24119,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24140,13 +24140,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24157,10 +24157,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24171,10 +24171,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24185,13 +24185,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24201,10 +24201,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24215,10 +24215,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24229,10 +24229,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -24245,12 +24245,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -24262,11 +24262,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -24278,11 +24278,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24293,11 +24293,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24309,11 +24309,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24327,13 +24327,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24344,11 +24344,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24360,11 +24360,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24378,13 +24378,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24395,11 +24395,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24411,11 +24411,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24429,13 +24429,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24447,12 +24447,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24465,12 +24465,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24483,12 +24483,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24502,13 +24502,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24520,12 +24520,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24538,12 +24538,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24556,12 +24556,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24575,13 +24575,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24596,13 +24596,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24613,10 +24613,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24627,10 +24627,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24641,13 +24641,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24657,10 +24657,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24671,10 +24671,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -24685,10 +24685,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -24701,12 +24701,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -24718,11 +24718,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -24734,11 +24734,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24749,11 +24749,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24765,11 +24765,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24783,13 +24783,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24800,11 +24800,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24816,11 +24816,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24834,13 +24834,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24851,11 +24851,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24867,11 +24867,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24885,13 +24885,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24903,12 +24903,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24921,12 +24921,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24939,12 +24939,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24958,13 +24958,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24976,12 +24976,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24994,12 +24994,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25012,12 +25012,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25031,13 +25031,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25052,13 +25052,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25069,10 +25069,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25083,10 +25083,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25097,13 +25097,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25113,10 +25113,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25127,10 +25127,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25141,10 +25141,10 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -25157,12 +25157,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -25174,11 +25174,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -25190,11 +25190,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25205,11 +25205,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25221,11 +25221,11 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25239,13 +25239,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25256,11 +25256,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25272,11 +25272,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25290,13 +25290,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25307,11 +25307,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25323,11 +25323,11 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25341,13 +25341,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25359,12 +25359,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25377,12 +25377,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25395,12 +25395,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25414,13 +25414,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25432,12 +25432,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25450,12 +25450,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25468,12 +25468,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25487,13 +25487,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25508,13 +25508,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25525,10 +25525,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25539,10 +25539,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25553,13 +25553,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -25569,10 +25569,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -25583,10 +25583,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -25597,10 +25597,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -25613,12 +25613,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -25630,11 +25630,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -25646,11 +25646,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25661,11 +25661,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25677,11 +25677,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25695,13 +25695,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -25712,11 +25712,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -25728,11 +25728,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25746,13 +25746,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -25763,11 +25763,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -25779,11 +25779,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25797,13 +25797,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25815,12 +25815,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25833,12 +25833,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25851,12 +25851,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25870,13 +25870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25888,12 +25888,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25906,12 +25906,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25924,12 +25924,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -25943,13 +25943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25964,13 +25964,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -25981,10 +25981,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -25995,10 +25995,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -26009,13 +26009,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26025,10 +26025,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26039,10 +26039,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26053,10 +26053,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -26069,12 +26069,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -26086,11 +26086,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -26102,11 +26102,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26117,11 +26117,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26133,11 +26133,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26151,13 +26151,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -26168,11 +26168,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -26184,11 +26184,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26202,13 +26202,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -26219,11 +26219,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -26235,11 +26235,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26253,13 +26253,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26271,12 +26271,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26289,12 +26289,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26307,12 +26307,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26326,13 +26326,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26344,12 +26344,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26362,12 +26362,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26380,12 +26380,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26399,13 +26399,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26420,13 +26420,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26437,10 +26437,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26451,10 +26451,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26465,13 +26465,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -26481,10 +26481,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -26495,10 +26495,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -26509,10 +26509,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -26525,12 +26525,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -26542,11 +26542,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -26558,11 +26558,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26573,11 +26573,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26589,11 +26589,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26607,13 +26607,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -26624,11 +26624,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -26640,11 +26640,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26658,13 +26658,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -26675,11 +26675,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -26691,11 +26691,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26709,13 +26709,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26727,12 +26727,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26745,12 +26745,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26763,12 +26763,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26782,13 +26782,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26800,12 +26800,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26818,12 +26818,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26836,12 +26836,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26855,13 +26855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26876,13 +26876,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -26893,10 +26893,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -26907,10 +26907,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -26921,13 +26921,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26937,10 +26937,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26951,10 +26951,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26965,10 +26965,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -26981,12 +26981,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -26998,11 +26998,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -27014,11 +27014,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27029,11 +27029,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27045,11 +27045,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27063,13 +27063,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27080,11 +27080,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27096,11 +27096,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27114,13 +27114,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -27131,11 +27131,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -27147,11 +27147,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27165,13 +27165,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27183,12 +27183,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27201,12 +27201,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27219,12 +27219,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27238,13 +27238,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27256,12 +27256,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27274,12 +27274,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27292,12 +27292,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27311,13 +27311,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27332,13 +27332,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -27349,10 +27349,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -27363,10 +27363,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -27377,13 +27377,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -27393,10 +27393,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -27407,10 +27407,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -27421,10 +27421,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -27437,12 +27437,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -27454,11 +27454,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -27470,11 +27470,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27485,11 +27485,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27501,11 +27501,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27519,13 +27519,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27536,11 +27536,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27552,11 +27552,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27570,13 +27570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -27587,11 +27587,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -27603,11 +27603,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27621,13 +27621,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27639,12 +27639,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27657,12 +27657,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27675,12 +27675,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27694,13 +27694,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27712,12 +27712,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27730,12 +27730,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27748,12 +27748,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27767,13 +27767,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27788,13 +27788,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -27805,10 +27805,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -27819,10 +27819,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -27833,13 +27833,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27849,10 +27849,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27863,10 +27863,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27877,10 +27877,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -27893,12 +27893,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -27910,11 +27910,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -27926,11 +27926,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27941,11 +27941,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27957,11 +27957,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27975,13 +27975,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27992,11 +27992,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28008,11 +28008,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28026,13 +28026,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28043,11 +28043,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28059,11 +28059,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28077,13 +28077,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28095,12 +28095,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28113,12 +28113,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28131,12 +28131,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28150,13 +28150,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28168,12 +28168,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28186,12 +28186,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28204,12 +28204,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28223,13 +28223,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28244,13 +28244,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -28261,10 +28261,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -28275,10 +28275,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -28289,13 +28289,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -28305,10 +28305,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -28319,10 +28319,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -28333,10 +28333,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -28349,12 +28349,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -28366,11 +28366,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -28382,11 +28382,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -28397,11 +28397,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -28413,11 +28413,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28431,13 +28431,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28448,11 +28448,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28464,11 +28464,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28482,13 +28482,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28499,11 +28499,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28515,11 +28515,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28533,13 +28533,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28551,12 +28551,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28569,12 +28569,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28587,12 +28587,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28606,13 +28606,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28624,12 +28624,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28642,12 +28642,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28660,12 +28660,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28679,13 +28679,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28700,13 +28700,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -28717,10 +28717,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -28731,10 +28731,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -28745,13 +28745,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -28761,10 +28761,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -28775,10 +28775,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -28789,10 +28789,10 @@
       .n(16)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -28805,12 +28805,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -28822,11 +28822,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -28838,11 +28838,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -28853,11 +28853,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -28869,11 +28869,11 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -28887,13 +28887,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28904,11 +28904,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28920,11 +28920,11 @@
         .n(16)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -28938,13 +28938,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28955,11 +28955,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28971,11 +28971,11 @@
         .n(16)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -28989,13 +28989,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29007,12 +29007,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29025,12 +29025,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29043,12 +29043,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29062,13 +29062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29080,12 +29080,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29098,12 +29098,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29116,12 +29116,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29135,13 +29135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -29156,13 +29156,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29173,10 +29173,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29187,10 +29187,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29201,13 +29201,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29217,10 +29217,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29231,10 +29231,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29245,10 +29245,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -29261,12 +29261,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -29278,11 +29278,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -29294,11 +29294,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -29309,11 +29309,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -29325,11 +29325,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29343,13 +29343,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -29360,11 +29360,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -29376,11 +29376,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29394,13 +29394,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -29411,11 +29411,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -29427,11 +29427,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29445,13 +29445,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29463,12 +29463,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29481,12 +29481,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29499,12 +29499,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29518,13 +29518,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29536,12 +29536,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29554,12 +29554,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29572,12 +29572,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29591,13 +29591,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29612,13 +29612,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29629,10 +29629,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29643,10 +29643,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29657,13 +29657,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29673,10 +29673,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29687,10 +29687,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29701,10 +29701,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -29717,12 +29717,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -29734,11 +29734,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -29750,11 +29750,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -29765,11 +29765,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -29781,11 +29781,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29799,13 +29799,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -29816,11 +29816,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -29832,11 +29832,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29850,13 +29850,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -29867,11 +29867,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -29883,11 +29883,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -29901,13 +29901,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29919,12 +29919,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29937,12 +29937,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29955,12 +29955,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29974,13 +29974,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29992,12 +29992,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30010,12 +30010,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30028,12 +30028,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30047,13 +30047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30068,13 +30068,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30085,10 +30085,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30099,10 +30099,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30113,13 +30113,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30129,10 +30129,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30143,10 +30143,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_eq_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30157,10 +30157,10 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -30173,12 +30173,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -30190,11 +30190,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -30206,11 +30206,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -30221,11 +30221,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -30237,11 +30237,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30255,13 +30255,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30272,11 +30272,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30288,11 +30288,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30306,13 +30306,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -30323,11 +30323,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -30339,11 +30339,11 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30357,13 +30357,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30375,12 +30375,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30393,12 +30393,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30411,12 +30411,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30430,13 +30430,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30448,12 +30448,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30466,12 +30466,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30484,12 +30484,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -30503,13 +30503,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30524,13 +30524,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30541,10 +30541,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30555,10 +30555,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30569,13 +30569,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30585,10 +30585,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30599,10 +30599,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30613,10 +30613,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -30629,12 +30629,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -30646,11 +30646,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -30662,11 +30662,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30677,11 +30677,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30693,11 +30693,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30711,13 +30711,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -30728,11 +30728,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -30744,11 +30744,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30762,13 +30762,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -30779,11 +30779,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -30795,11 +30795,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30813,13 +30813,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30831,12 +30831,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30849,12 +30849,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30867,12 +30867,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30886,13 +30886,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30904,12 +30904,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30922,12 +30922,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30940,12 +30940,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30959,13 +30959,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30980,13 +30980,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30997,10 +30997,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31011,10 +31011,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31025,13 +31025,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31041,10 +31041,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31055,10 +31055,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31069,10 +31069,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -31085,12 +31085,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -31102,11 +31102,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -31118,11 +31118,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -31133,11 +31133,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -31149,11 +31149,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31167,13 +31167,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31184,11 +31184,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31200,11 +31200,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31218,13 +31218,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -31235,11 +31235,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -31251,11 +31251,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31269,13 +31269,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31287,12 +31287,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31305,12 +31305,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31323,12 +31323,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31342,13 +31342,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31360,12 +31360,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31378,12 +31378,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31396,12 +31396,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31415,13 +31415,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31436,13 +31436,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31453,10 +31453,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31467,10 +31467,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31481,13 +31481,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31497,10 +31497,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31511,10 +31511,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31525,10 +31525,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -31541,12 +31541,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -31558,11 +31558,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -31574,11 +31574,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -31589,11 +31589,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -31605,11 +31605,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31623,13 +31623,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31640,11 +31640,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31656,11 +31656,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31674,13 +31674,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -31691,11 +31691,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -31707,11 +31707,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31725,13 +31725,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31743,12 +31743,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31761,12 +31761,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31779,12 +31779,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31798,13 +31798,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31816,12 +31816,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31834,12 +31834,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31852,12 +31852,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31871,13 +31871,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31892,13 +31892,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31909,10 +31909,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31923,10 +31923,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31937,13 +31937,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -31953,10 +31953,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -31967,10 +31967,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -31981,10 +31981,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -31997,12 +31997,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -32014,11 +32014,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -32030,11 +32030,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32045,11 +32045,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32061,11 +32061,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32079,13 +32079,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -32096,11 +32096,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -32112,11 +32112,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32130,13 +32130,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32147,11 +32147,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32163,11 +32163,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32181,13 +32181,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32199,12 +32199,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32217,12 +32217,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32235,12 +32235,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32254,13 +32254,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32272,12 +32272,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32290,12 +32290,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32308,12 +32308,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32327,13 +32327,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32348,13 +32348,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32365,10 +32365,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32379,10 +32379,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32393,13 +32393,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32409,10 +32409,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32423,10 +32423,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32437,10 +32437,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -32453,12 +32453,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -32470,11 +32470,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -32486,11 +32486,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32501,11 +32501,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32517,11 +32517,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32535,13 +32535,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -32552,11 +32552,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -32568,11 +32568,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32586,13 +32586,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32603,11 +32603,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32619,11 +32619,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32637,13 +32637,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32655,12 +32655,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32673,12 +32673,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32691,12 +32691,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32710,13 +32710,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32728,12 +32728,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32746,12 +32746,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32764,12 +32764,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32783,13 +32783,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32804,13 +32804,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32821,10 +32821,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32835,10 +32835,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32849,13 +32849,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32865,10 +32865,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32879,10 +32879,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32893,10 +32893,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -32909,12 +32909,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -32926,11 +32926,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -32942,11 +32942,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32957,11 +32957,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32973,11 +32973,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32991,13 +32991,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -33008,11 +33008,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -33024,11 +33024,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33042,13 +33042,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -33059,11 +33059,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -33075,11 +33075,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33093,13 +33093,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33111,12 +33111,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33129,12 +33129,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33147,12 +33147,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33166,13 +33166,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33184,12 +33184,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33202,12 +33202,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33220,12 +33220,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33239,13 +33239,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33260,13 +33260,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33277,10 +33277,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33291,10 +33291,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33305,13 +33305,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33321,10 +33321,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33335,10 +33335,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33349,10 +33349,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -33365,12 +33365,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -33382,11 +33382,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -33398,11 +33398,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -33413,11 +33413,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -33429,11 +33429,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33447,13 +33447,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -33464,11 +33464,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -33480,11 +33480,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33498,13 +33498,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -33515,11 +33515,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -33531,11 +33531,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33549,13 +33549,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33567,12 +33567,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33585,12 +33585,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33603,12 +33603,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33622,13 +33622,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33640,12 +33640,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33658,12 +33658,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33676,12 +33676,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33695,13 +33695,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33716,13 +33716,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33733,10 +33733,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33747,10 +33747,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33761,7 +33761,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -37415,7 +37415,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -37425,10 +37425,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -37439,10 +37439,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -37453,10 +37453,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -37469,12 +37469,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -37486,11 +37486,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -37502,11 +37502,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -37517,11 +37517,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -37533,11 +37533,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -37551,13 +37551,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -37568,11 +37568,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -37584,11 +37584,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -37602,13 +37602,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -37619,11 +37619,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -37635,11 +37635,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -37653,13 +37653,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37671,12 +37671,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37689,12 +37689,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37707,12 +37707,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37726,13 +37726,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37744,12 +37744,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37762,12 +37762,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37780,12 +37780,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37799,13 +37799,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -37820,13 +37820,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -37837,10 +37837,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -37851,10 +37851,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -37865,13 +37865,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37881,10 +37881,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37895,10 +37895,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37909,10 +37909,10 @@
       .n(8)
       .k(16)
       .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -37925,12 +37925,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -37942,11 +37942,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -37958,11 +37958,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -37973,11 +37973,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -37989,11 +37989,11 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -38007,13 +38007,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -38024,11 +38024,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -38040,11 +38040,11 @@
         .n(8)
         .k(k)
         .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -38058,13 +38058,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -38075,11 +38075,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -38091,11 +38091,11 @@
         .n(8)
         .k(k)
         .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -38109,13 +38109,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38127,12 +38127,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38145,12 +38145,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38163,12 +38163,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38182,13 +38182,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38200,12 +38200,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38218,12 +38218,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38236,12 +38236,12 @@
           .n(n)
           .k(k)
           .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -38255,13 +38255,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -38276,13 +38276,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -38293,10 +38293,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -38307,10 +38307,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -38321,7 +38321,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qs8-gemm-minmax-rndnu.yaml b/test/qs8-gemm-minmax-rndnu.yaml
index 9427b75..81ae311 100644
--- a/test/qs8-gemm-minmax-rndnu.yaml
+++ b/test/qs8-gemm-minmax-rndnu.yaml
@@ -3,226 +3,226 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
 - name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -249,10 +249,10 @@
 - name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
 - name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 78f69ec..f90996e 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -63,12 +63,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -80,11 +80,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -96,11 +96,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -111,11 +111,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -129,13 +129,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -146,11 +146,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -164,13 +164,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -181,11 +181,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -199,13 +199,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -217,12 +217,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -235,12 +235,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -254,13 +254,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -272,12 +272,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -290,12 +290,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -309,13 +309,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -327,11 +327,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -346,13 +346,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -365,12 +365,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -383,12 +383,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -403,13 +403,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -422,11 +422,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -441,12 +441,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -457,10 +457,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -471,10 +471,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -485,13 +485,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -501,10 +501,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -515,10 +515,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -531,12 +531,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -548,11 +548,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -564,11 +564,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -579,11 +579,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -597,13 +597,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -614,11 +614,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -632,13 +632,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -649,11 +649,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -667,13 +667,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -685,12 +685,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -703,12 +703,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -722,13 +722,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -740,12 +740,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -758,12 +758,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -777,13 +777,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -795,11 +795,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -814,13 +814,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -833,12 +833,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -851,12 +851,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -871,13 +871,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -890,11 +890,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -909,12 +909,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -925,10 +925,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -939,10 +939,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -953,13 +953,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -969,10 +969,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -983,10 +983,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -999,12 +999,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1016,11 +1016,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1032,11 +1032,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1047,11 +1047,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1065,13 +1065,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1082,11 +1082,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1100,13 +1100,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1117,11 +1117,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1135,13 +1135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1153,12 +1153,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1171,12 +1171,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1190,13 +1190,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1208,12 +1208,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1226,12 +1226,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1245,13 +1245,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1263,11 +1263,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1282,13 +1282,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1301,12 +1301,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1319,12 +1319,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1339,13 +1339,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1358,11 +1358,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1377,12 +1377,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1393,10 +1393,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1407,10 +1407,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -1421,13 +1421,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1437,10 +1437,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1451,10 +1451,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1467,12 +1467,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1484,11 +1484,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1500,11 +1500,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1515,11 +1515,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1533,13 +1533,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1550,11 +1550,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1568,13 +1568,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1585,11 +1585,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1603,13 +1603,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1621,12 +1621,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1639,12 +1639,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1658,13 +1658,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1676,12 +1676,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1694,12 +1694,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1713,13 +1713,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1731,11 +1731,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1750,13 +1750,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1769,12 +1769,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1787,12 +1787,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1807,13 +1807,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1826,11 +1826,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1845,12 +1845,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1861,10 +1861,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1875,10 +1875,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -1889,13 +1889,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1905,10 +1905,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1919,10 +1919,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1935,12 +1935,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1952,11 +1952,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1968,11 +1968,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1983,11 +1983,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2001,13 +2001,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2018,11 +2018,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2036,13 +2036,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2053,11 +2053,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2071,13 +2071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2089,12 +2089,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2107,12 +2107,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2126,13 +2126,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2144,12 +2144,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2162,12 +2162,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2181,13 +2181,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2199,11 +2199,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2218,13 +2218,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2237,12 +2237,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2255,12 +2255,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2275,13 +2275,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2294,11 +2294,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2313,12 +2313,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2329,10 +2329,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2343,10 +2343,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2357,13 +2357,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2373,10 +2373,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2387,10 +2387,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2403,12 +2403,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2420,11 +2420,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2436,11 +2436,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2451,11 +2451,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2469,13 +2469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2486,11 +2486,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2504,13 +2504,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2521,11 +2521,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2539,13 +2539,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2557,12 +2557,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2575,12 +2575,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2594,13 +2594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2612,12 +2612,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2630,12 +2630,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2649,13 +2649,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2667,11 +2667,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2686,13 +2686,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2705,12 +2705,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2723,12 +2723,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2743,13 +2743,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2762,11 +2762,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2781,12 +2781,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2797,10 +2797,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2811,10 +2811,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2825,13 +2825,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2841,10 +2841,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -2855,10 +2855,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2871,12 +2871,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -2888,11 +2888,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2904,11 +2904,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2919,11 +2919,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2937,13 +2937,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2954,11 +2954,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2972,13 +2972,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2989,11 +2989,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3007,13 +3007,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3025,12 +3025,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3043,12 +3043,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3062,13 +3062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3080,12 +3080,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3098,12 +3098,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3117,13 +3117,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3135,11 +3135,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3154,13 +3154,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3173,12 +3173,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3191,12 +3191,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3211,13 +3211,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3230,11 +3230,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3249,12 +3249,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3265,10 +3265,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3279,10 +3279,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -3293,13 +3293,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3309,10 +3309,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3323,10 +3323,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3339,12 +3339,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -3356,11 +3356,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3372,11 +3372,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3387,11 +3387,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3405,13 +3405,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3422,11 +3422,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3440,13 +3440,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3457,11 +3457,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3475,13 +3475,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3493,12 +3493,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3511,12 +3511,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3530,13 +3530,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3548,12 +3548,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3566,12 +3566,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3585,13 +3585,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3603,11 +3603,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3622,13 +3622,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3641,12 +3641,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3659,12 +3659,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3679,13 +3679,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3698,11 +3698,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3717,12 +3717,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3733,10 +3733,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3747,10 +3747,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -3761,13 +3761,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3777,10 +3777,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3791,10 +3791,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3807,12 +3807,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3855,11 +3855,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3873,13 +3873,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3890,11 +3890,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3908,13 +3908,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3925,11 +3925,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3943,13 +3943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3961,12 +3961,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3979,12 +3979,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3998,13 +3998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4016,12 +4016,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4034,12 +4034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4053,13 +4053,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4071,11 +4071,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4090,13 +4090,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4109,12 +4109,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4127,12 +4127,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4147,13 +4147,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4166,11 +4166,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4185,12 +4185,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4201,10 +4201,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4215,10 +4215,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4229,13 +4229,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4245,10 +4245,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4259,10 +4259,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4275,12 +4275,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4292,11 +4292,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4308,11 +4308,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4323,11 +4323,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4341,13 +4341,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4358,11 +4358,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4376,13 +4376,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4393,11 +4393,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4411,13 +4411,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4429,12 +4429,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4447,12 +4447,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4466,13 +4466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4484,12 +4484,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4502,12 +4502,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4521,13 +4521,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4539,11 +4539,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4558,13 +4558,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4577,12 +4577,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4595,12 +4595,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4615,13 +4615,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4634,11 +4634,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4653,12 +4653,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4669,10 +4669,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4683,10 +4683,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4697,13 +4697,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4713,10 +4713,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -4727,10 +4727,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4743,12 +4743,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4760,11 +4760,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4776,11 +4776,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4791,11 +4791,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4809,13 +4809,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4826,11 +4826,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4844,13 +4844,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4861,11 +4861,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4879,13 +4879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4897,12 +4897,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4915,12 +4915,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4934,13 +4934,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4952,12 +4952,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4970,12 +4970,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4989,13 +4989,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5007,11 +5007,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5026,13 +5026,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5045,12 +5045,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5063,12 +5063,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5083,13 +5083,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5102,11 +5102,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5121,12 +5121,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5137,10 +5137,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5151,10 +5151,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -5165,13 +5165,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5181,10 +5181,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5195,10 +5195,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5211,12 +5211,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -5228,11 +5228,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5244,11 +5244,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5294,11 +5294,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5312,13 +5312,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5329,11 +5329,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5347,13 +5347,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5365,12 +5365,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5383,12 +5383,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5402,13 +5402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5420,12 +5420,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5438,12 +5438,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5457,13 +5457,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5475,11 +5475,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5494,13 +5494,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5513,12 +5513,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5531,12 +5531,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5551,13 +5551,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5570,11 +5570,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5589,12 +5589,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5605,10 +5605,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5619,10 +5619,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -5633,13 +5633,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5649,10 +5649,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5663,10 +5663,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5679,12 +5679,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5696,11 +5696,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5712,11 +5712,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5727,11 +5727,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5745,13 +5745,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5762,11 +5762,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5780,13 +5780,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5797,11 +5797,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5815,13 +5815,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5833,12 +5833,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5851,12 +5851,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5870,13 +5870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5888,12 +5888,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5906,12 +5906,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5925,13 +5925,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5943,11 +5943,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5962,13 +5962,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5981,12 +5981,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5999,12 +5999,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6019,13 +6019,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6038,11 +6038,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6057,12 +6057,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6073,10 +6073,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6087,10 +6087,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6101,13 +6101,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6117,10 +6117,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6131,10 +6131,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6147,12 +6147,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6164,11 +6164,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6180,11 +6180,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6195,11 +6195,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6213,13 +6213,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6230,11 +6230,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6248,13 +6248,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6265,11 +6265,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6283,13 +6283,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6301,12 +6301,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6319,12 +6319,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6338,13 +6338,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6356,12 +6356,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6374,12 +6374,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6393,13 +6393,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6411,11 +6411,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6430,13 +6430,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6449,12 +6449,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6467,12 +6467,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6487,13 +6487,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6506,11 +6506,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6525,12 +6525,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6541,10 +6541,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6555,10 +6555,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6569,13 +6569,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6585,10 +6585,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -6599,10 +6599,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6615,12 +6615,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -6632,11 +6632,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6648,11 +6648,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6663,11 +6663,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6681,13 +6681,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6698,11 +6698,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6716,13 +6716,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6733,11 +6733,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6751,13 +6751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6769,12 +6769,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6787,12 +6787,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6806,13 +6806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6824,12 +6824,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6842,12 +6842,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6861,13 +6861,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6879,11 +6879,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6898,13 +6898,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6917,12 +6917,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6935,12 +6935,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6955,13 +6955,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6974,11 +6974,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6993,12 +6993,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -7009,10 +7009,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -7023,10 +7023,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -7037,13 +7037,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7053,10 +7053,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7067,10 +7067,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7083,12 +7083,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7100,11 +7100,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7116,11 +7116,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7131,11 +7131,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7149,13 +7149,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7166,11 +7166,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7184,13 +7184,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7201,11 +7201,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7219,13 +7219,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7237,12 +7237,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7255,12 +7255,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7274,13 +7274,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7292,12 +7292,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7310,12 +7310,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7329,13 +7329,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7347,11 +7347,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7366,13 +7366,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7385,12 +7385,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7403,12 +7403,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7423,13 +7423,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7442,11 +7442,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7461,12 +7461,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7477,10 +7477,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7491,10 +7491,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -7505,13 +7505,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7521,10 +7521,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7535,10 +7535,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7551,12 +7551,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7568,11 +7568,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7584,11 +7584,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7599,11 +7599,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7617,13 +7617,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7634,11 +7634,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7652,13 +7652,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7669,11 +7669,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7687,13 +7687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7705,12 +7705,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7723,12 +7723,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7742,13 +7742,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7760,12 +7760,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7778,12 +7778,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7797,13 +7797,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7815,11 +7815,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7834,13 +7834,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7853,12 +7853,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7871,12 +7871,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7891,13 +7891,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7910,11 +7910,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7929,12 +7929,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7945,10 +7945,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7959,10 +7959,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7973,13 +7973,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7989,10 +7989,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8003,10 +8003,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8019,12 +8019,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8036,11 +8036,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8052,11 +8052,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8067,11 +8067,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8085,13 +8085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8102,11 +8102,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8120,13 +8120,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8137,11 +8137,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8155,13 +8155,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8173,12 +8173,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8191,12 +8191,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8210,13 +8210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8228,12 +8228,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8246,12 +8246,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8265,13 +8265,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8283,11 +8283,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8302,13 +8302,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8321,12 +8321,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8339,12 +8339,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8359,13 +8359,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8378,11 +8378,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8397,12 +8397,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8413,10 +8413,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8427,10 +8427,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8441,13 +8441,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8457,10 +8457,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8471,10 +8471,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8487,12 +8487,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8504,11 +8504,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8520,11 +8520,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8535,11 +8535,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8553,13 +8553,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -8570,11 +8570,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8588,13 +8588,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -8605,11 +8605,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8623,13 +8623,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8641,12 +8641,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8659,12 +8659,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8678,13 +8678,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8696,12 +8696,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8714,12 +8714,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8733,13 +8733,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8751,11 +8751,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8770,13 +8770,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8789,12 +8789,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8807,12 +8807,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8827,13 +8827,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -8846,11 +8846,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -8865,12 +8865,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8881,10 +8881,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8895,10 +8895,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8909,13 +8909,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8925,10 +8925,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8939,10 +8939,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8955,12 +8955,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8972,11 +8972,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8988,11 +8988,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9003,11 +9003,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9021,13 +9021,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9038,11 +9038,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9056,13 +9056,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9073,11 +9073,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9091,13 +9091,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9109,12 +9109,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9127,12 +9127,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9146,13 +9146,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9164,12 +9164,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9182,12 +9182,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9201,13 +9201,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9219,11 +9219,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9238,13 +9238,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9257,12 +9257,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9275,12 +9275,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9295,13 +9295,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9314,11 +9314,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9333,12 +9333,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9349,10 +9349,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9363,10 +9363,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9377,13 +9377,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9393,10 +9393,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9407,10 +9407,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9423,12 +9423,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9440,11 +9440,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9456,11 +9456,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9471,11 +9471,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9489,13 +9489,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9506,11 +9506,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9524,13 +9524,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -9541,11 +9541,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9559,13 +9559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9577,12 +9577,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9595,12 +9595,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9614,13 +9614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9632,12 +9632,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9650,12 +9650,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9669,13 +9669,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9687,11 +9687,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9706,13 +9706,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9725,12 +9725,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9743,12 +9743,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9763,13 +9763,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -9782,11 +9782,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -9801,12 +9801,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9817,10 +9817,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9831,10 +9831,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9845,13 +9845,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9861,10 +9861,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9875,10 +9875,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9891,12 +9891,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -9908,11 +9908,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9924,11 +9924,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9939,11 +9939,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9957,13 +9957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -9974,11 +9974,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9992,13 +9992,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10009,11 +10009,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10027,13 +10027,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10045,12 +10045,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10063,12 +10063,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10082,13 +10082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10100,12 +10100,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10118,12 +10118,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10137,13 +10137,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10155,11 +10155,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10174,13 +10174,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10193,12 +10193,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10211,12 +10211,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10231,13 +10231,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10250,11 +10250,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10269,12 +10269,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10285,10 +10285,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10299,10 +10299,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10313,13 +10313,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10329,10 +10329,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10343,10 +10343,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10359,12 +10359,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10376,11 +10376,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10392,11 +10392,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10407,11 +10407,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10425,13 +10425,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10442,11 +10442,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10460,13 +10460,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10477,11 +10477,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10495,13 +10495,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10513,12 +10513,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10531,12 +10531,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10550,13 +10550,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10568,12 +10568,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10586,12 +10586,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10605,13 +10605,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10623,11 +10623,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10642,13 +10642,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10661,12 +10661,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10679,12 +10679,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10699,13 +10699,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -10718,11 +10718,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10737,12 +10737,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10753,10 +10753,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10767,10 +10767,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10781,13 +10781,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10797,10 +10797,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -10811,10 +10811,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10827,12 +10827,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -10844,11 +10844,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10860,11 +10860,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10875,11 +10875,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10893,13 +10893,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -10910,11 +10910,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10928,13 +10928,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -10945,11 +10945,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -10963,13 +10963,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10981,12 +10981,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -10999,12 +10999,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11018,13 +11018,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11036,12 +11036,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11054,12 +11054,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11073,13 +11073,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11091,11 +11091,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11110,13 +11110,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11129,12 +11129,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11147,12 +11147,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11167,13 +11167,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11186,11 +11186,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11205,12 +11205,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11221,10 +11221,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11235,10 +11235,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11249,13 +11249,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11265,10 +11265,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11279,10 +11279,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11295,12 +11295,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11312,11 +11312,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11328,11 +11328,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11343,11 +11343,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11361,13 +11361,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11378,11 +11378,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11396,13 +11396,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11413,11 +11413,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11431,13 +11431,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11449,12 +11449,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11467,12 +11467,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11486,13 +11486,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11504,12 +11504,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11522,12 +11522,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11541,13 +11541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11559,11 +11559,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11578,13 +11578,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11597,12 +11597,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11615,12 +11615,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11635,13 +11635,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11654,11 +11654,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11673,12 +11673,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11689,10 +11689,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11703,10 +11703,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11717,7 +11717,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -15935,7 +15935,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15945,10 +15945,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15959,10 +15959,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15975,12 +15975,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -15992,11 +15992,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16008,11 +16008,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16023,11 +16023,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16041,13 +16041,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16058,11 +16058,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16076,13 +16076,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -16093,11 +16093,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16111,13 +16111,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16129,12 +16129,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16147,12 +16147,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16166,13 +16166,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16184,12 +16184,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16202,12 +16202,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16221,13 +16221,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16239,11 +16239,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16258,13 +16258,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16277,12 +16277,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16295,12 +16295,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16315,13 +16315,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16334,11 +16334,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16353,12 +16353,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16369,10 +16369,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16383,10 +16383,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16397,13 +16397,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16413,10 +16413,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16427,10 +16427,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16443,12 +16443,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -16460,11 +16460,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16476,11 +16476,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16491,11 +16491,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16509,13 +16509,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16526,11 +16526,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16544,13 +16544,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -16561,11 +16561,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16579,13 +16579,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16597,12 +16597,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16615,12 +16615,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16634,13 +16634,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16652,12 +16652,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16670,12 +16670,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16689,13 +16689,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16707,11 +16707,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16726,13 +16726,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16745,12 +16745,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16763,12 +16763,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16783,13 +16783,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16802,11 +16802,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16821,12 +16821,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16837,10 +16837,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16851,10 +16851,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16865,13 +16865,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -16881,10 +16881,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -16895,10 +16895,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16911,12 +16911,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16928,11 +16928,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16944,11 +16944,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16959,11 +16959,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16977,13 +16977,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16994,11 +16994,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17012,13 +17012,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17029,11 +17029,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17047,13 +17047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17065,12 +17065,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17083,12 +17083,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17102,13 +17102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17120,12 +17120,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17138,12 +17138,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17157,13 +17157,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17175,11 +17175,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17194,13 +17194,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17213,12 +17213,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17231,12 +17231,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17251,13 +17251,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17270,11 +17270,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17289,12 +17289,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -17305,10 +17305,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -17319,10 +17319,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(1)
@@ -17333,13 +17333,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17349,10 +17349,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17363,10 +17363,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17379,12 +17379,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -17396,11 +17396,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17412,11 +17412,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17427,11 +17427,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17445,13 +17445,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17462,11 +17462,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17480,13 +17480,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17497,11 +17497,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17515,13 +17515,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17533,12 +17533,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17551,12 +17551,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17570,13 +17570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17588,12 +17588,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17606,12 +17606,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17625,13 +17625,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17643,11 +17643,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17662,13 +17662,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17681,12 +17681,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17699,12 +17699,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17719,13 +17719,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17738,11 +17738,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON_V8;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17757,12 +17757,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17773,10 +17773,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17787,10 +17787,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON_V8;
     GemmMicrokernelTester()
       .mr(2)
@@ -17801,7 +17801,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index aabba65..c486df6 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,79 +3,79 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4s2__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -105,16 +105,16 @@
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neondot
diff --git a/test/qs8-igemm-minmax-gemmlowp.cc b/test/qs8-igemm-minmax-gemmlowp.cc
index 0b67ac2..8c707fd 100644
--- a/test/qs8-igemm-minmax-gemmlowp.cc
+++ b/test/qs8-igemm-minmax-gemmlowp.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -63,12 +63,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -80,11 +80,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -96,11 +96,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -111,11 +111,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -129,13 +129,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -146,11 +146,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -164,13 +164,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -181,11 +181,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -199,13 +199,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -217,12 +217,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -235,12 +235,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -254,13 +254,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -272,12 +272,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -290,12 +290,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -309,13 +309,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -327,11 +327,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -346,13 +346,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -365,12 +365,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -383,12 +383,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -403,13 +403,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -422,11 +422,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -441,12 +441,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -457,10 +457,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -471,10 +471,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -485,13 +485,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -501,10 +501,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -515,10 +515,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -531,12 +531,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -548,11 +548,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -564,11 +564,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -579,11 +579,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -597,13 +597,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -614,11 +614,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -632,13 +632,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -649,11 +649,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -667,13 +667,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -685,12 +685,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -703,12 +703,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -722,13 +722,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -740,12 +740,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -758,12 +758,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -777,13 +777,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -795,11 +795,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -814,13 +814,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -833,12 +833,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -851,12 +851,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -871,13 +871,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -890,11 +890,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -909,12 +909,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -925,10 +925,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -939,10 +939,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -953,13 +953,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -969,10 +969,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -983,10 +983,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -999,12 +999,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1016,11 +1016,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1032,11 +1032,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1047,11 +1047,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1065,13 +1065,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1082,11 +1082,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1100,13 +1100,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1117,11 +1117,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1135,13 +1135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1153,12 +1153,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1171,12 +1171,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1190,13 +1190,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1208,12 +1208,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1226,12 +1226,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1245,13 +1245,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1263,11 +1263,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1282,13 +1282,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1301,12 +1301,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1319,12 +1319,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1339,13 +1339,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1358,11 +1358,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1377,12 +1377,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1393,10 +1393,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1407,10 +1407,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1421,13 +1421,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1437,10 +1437,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1451,10 +1451,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1467,12 +1467,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1484,11 +1484,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1500,11 +1500,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1515,11 +1515,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1533,13 +1533,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -1550,11 +1550,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1568,13 +1568,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -1585,11 +1585,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1603,13 +1603,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1621,12 +1621,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1639,12 +1639,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1658,13 +1658,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1676,12 +1676,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1694,12 +1694,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1713,13 +1713,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1731,11 +1731,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1750,13 +1750,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1769,12 +1769,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1787,12 +1787,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -1807,13 +1807,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -1826,11 +1826,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -1845,12 +1845,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1861,10 +1861,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1875,10 +1875,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1889,13 +1889,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1905,10 +1905,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -1919,10 +1919,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1935,12 +1935,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -1952,11 +1952,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1968,11 +1968,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1983,11 +1983,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2001,13 +2001,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2018,11 +2018,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2036,13 +2036,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2053,11 +2053,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2071,13 +2071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2089,12 +2089,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2107,12 +2107,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2126,13 +2126,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2144,12 +2144,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2162,12 +2162,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2181,13 +2181,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2199,11 +2199,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2218,13 +2218,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2237,12 +2237,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2255,12 +2255,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2275,13 +2275,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2294,11 +2294,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2313,12 +2313,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2329,10 +2329,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2343,10 +2343,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2357,13 +2357,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2373,10 +2373,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2387,10 +2387,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2403,12 +2403,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2420,11 +2420,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2436,11 +2436,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2451,11 +2451,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2469,13 +2469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2486,11 +2486,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2504,13 +2504,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2521,11 +2521,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2539,13 +2539,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2557,12 +2557,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2575,12 +2575,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2594,13 +2594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2612,12 +2612,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2630,12 +2630,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2649,13 +2649,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2667,11 +2667,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2686,13 +2686,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2705,12 +2705,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2723,12 +2723,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2743,13 +2743,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -2762,11 +2762,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -2781,12 +2781,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2797,10 +2797,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2811,10 +2811,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2825,13 +2825,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2841,10 +2841,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2855,10 +2855,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -2871,12 +2871,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2888,11 +2888,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -2904,11 +2904,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2919,11 +2919,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2937,13 +2937,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -2954,11 +2954,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2972,13 +2972,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -2989,11 +2989,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3007,13 +3007,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3025,12 +3025,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3043,12 +3043,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3062,13 +3062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3080,12 +3080,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3098,12 +3098,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3117,13 +3117,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3135,11 +3135,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3154,13 +3154,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3173,12 +3173,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3191,12 +3191,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -3211,13 +3211,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3230,11 +3230,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3249,12 +3249,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3265,10 +3265,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3279,10 +3279,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -3293,13 +3293,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3309,10 +3309,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3323,10 +3323,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3339,12 +3339,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3356,11 +3356,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3372,11 +3372,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3387,11 +3387,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3405,13 +3405,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3422,11 +3422,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3440,13 +3440,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3457,11 +3457,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3475,13 +3475,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3493,12 +3493,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3511,12 +3511,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3530,13 +3530,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3548,12 +3548,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3566,12 +3566,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3585,13 +3585,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3603,11 +3603,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3622,13 +3622,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3641,12 +3641,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3659,12 +3659,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3679,13 +3679,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -3698,11 +3698,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3717,12 +3717,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3733,10 +3733,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3747,10 +3747,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3761,13 +3761,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3777,10 +3777,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3791,10 +3791,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3807,12 +3807,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3855,11 +3855,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3873,13 +3873,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3890,11 +3890,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3908,13 +3908,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3925,11 +3925,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3943,13 +3943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3961,12 +3961,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3979,12 +3979,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3998,13 +3998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4016,12 +4016,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4034,12 +4034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4053,13 +4053,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4071,11 +4071,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4090,13 +4090,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4109,12 +4109,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4127,12 +4127,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4147,13 +4147,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4166,11 +4166,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4185,12 +4185,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4201,10 +4201,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4215,10 +4215,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4229,13 +4229,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4245,10 +4245,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4259,10 +4259,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4275,12 +4275,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4292,11 +4292,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4308,11 +4308,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4323,11 +4323,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4341,13 +4341,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4358,11 +4358,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4376,13 +4376,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4393,11 +4393,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4411,13 +4411,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4429,12 +4429,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4447,12 +4447,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4466,13 +4466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4484,12 +4484,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4502,12 +4502,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4521,13 +4521,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4539,11 +4539,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4558,13 +4558,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4577,12 +4577,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4595,12 +4595,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4615,13 +4615,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4634,11 +4634,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4653,12 +4653,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4669,10 +4669,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4683,10 +4683,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4697,13 +4697,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4713,10 +4713,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4727,10 +4727,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4743,12 +4743,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -4760,11 +4760,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4776,11 +4776,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4791,11 +4791,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4809,13 +4809,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4826,11 +4826,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4844,13 +4844,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4861,11 +4861,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4879,13 +4879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4897,12 +4897,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4915,12 +4915,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4934,13 +4934,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4952,12 +4952,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4970,12 +4970,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4989,13 +4989,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5007,11 +5007,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5026,13 +5026,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5045,12 +5045,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5063,12 +5063,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5083,13 +5083,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5102,11 +5102,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5121,12 +5121,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5137,10 +5137,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5151,10 +5151,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5165,7 +5165,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -12191,7 +12191,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12201,10 +12201,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12215,10 +12215,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12231,12 +12231,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -12248,11 +12248,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12264,11 +12264,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -12279,11 +12279,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12297,13 +12297,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12314,11 +12314,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12332,13 +12332,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -12349,11 +12349,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12367,13 +12367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12385,12 +12385,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12403,12 +12403,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12422,13 +12422,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12440,12 +12440,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12458,12 +12458,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12477,13 +12477,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -12495,11 +12495,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12514,13 +12514,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12533,12 +12533,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12551,12 +12551,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12571,13 +12571,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -12590,11 +12590,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12609,12 +12609,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12625,10 +12625,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12639,10 +12639,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -12653,13 +12653,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -12669,10 +12669,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -12683,10 +12683,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12699,12 +12699,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -12716,11 +12716,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12732,11 +12732,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -12747,11 +12747,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -12765,13 +12765,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12782,11 +12782,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -12800,13 +12800,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -12817,11 +12817,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -12835,13 +12835,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12853,12 +12853,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12871,12 +12871,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12890,13 +12890,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12908,12 +12908,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12926,12 +12926,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12945,13 +12945,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -12963,11 +12963,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -12982,13 +12982,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13001,12 +13001,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13019,12 +13019,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13039,13 +13039,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -13058,11 +13058,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13077,12 +13077,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13093,10 +13093,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13107,10 +13107,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13121,13 +13121,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13137,10 +13137,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13151,10 +13151,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -13167,12 +13167,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -13184,11 +13184,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13200,11 +13200,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13215,11 +13215,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13233,13 +13233,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13250,11 +13250,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13268,13 +13268,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -13285,11 +13285,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13303,13 +13303,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13321,12 +13321,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13339,12 +13339,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13358,13 +13358,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13376,12 +13376,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13394,12 +13394,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13413,13 +13413,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -13431,11 +13431,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13450,13 +13450,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13469,12 +13469,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13487,12 +13487,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13507,13 +13507,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -13526,11 +13526,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13545,12 +13545,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13561,10 +13561,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13575,10 +13575,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13589,13 +13589,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13605,10 +13605,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13619,10 +13619,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -13635,12 +13635,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -13652,11 +13652,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13668,11 +13668,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13683,11 +13683,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13701,13 +13701,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13718,11 +13718,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13736,13 +13736,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -13753,11 +13753,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13771,13 +13771,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13789,12 +13789,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13807,12 +13807,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13826,13 +13826,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13844,12 +13844,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13862,12 +13862,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13881,13 +13881,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -13899,11 +13899,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13918,13 +13918,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13937,12 +13937,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13955,12 +13955,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13975,13 +13975,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -13994,11 +13994,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14013,12 +14013,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14029,10 +14029,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14043,10 +14043,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14057,13 +14057,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14073,10 +14073,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14087,10 +14087,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -14103,12 +14103,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -14120,11 +14120,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -14136,11 +14136,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14151,11 +14151,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14169,13 +14169,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14186,11 +14186,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14204,13 +14204,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14221,11 +14221,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14239,13 +14239,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14257,12 +14257,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14275,12 +14275,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14294,13 +14294,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14312,12 +14312,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14330,12 +14330,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14349,13 +14349,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -14367,11 +14367,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14386,13 +14386,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14405,12 +14405,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14423,12 +14423,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14443,13 +14443,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -14462,11 +14462,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14481,12 +14481,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14497,10 +14497,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14511,10 +14511,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -14525,13 +14525,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14541,10 +14541,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14555,10 +14555,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -14571,12 +14571,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -14588,11 +14588,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -14604,11 +14604,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14619,11 +14619,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14637,13 +14637,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14654,11 +14654,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14672,13 +14672,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14689,11 +14689,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14707,13 +14707,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14725,12 +14725,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14743,12 +14743,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14762,13 +14762,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14780,12 +14780,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14798,12 +14798,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14817,13 +14817,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -14835,11 +14835,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14854,13 +14854,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14873,12 +14873,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14891,12 +14891,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -14911,13 +14911,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -14930,11 +14930,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14949,12 +14949,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14965,10 +14965,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14979,10 +14979,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14993,13 +14993,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15009,10 +15009,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15023,10 +15023,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -15039,12 +15039,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -15056,11 +15056,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -15072,11 +15072,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15087,11 +15087,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15105,13 +15105,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15122,11 +15122,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15140,13 +15140,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15157,11 +15157,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15175,13 +15175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15193,12 +15193,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15211,12 +15211,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15230,13 +15230,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15248,12 +15248,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15266,12 +15266,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15285,13 +15285,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15303,11 +15303,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15322,13 +15322,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15341,12 +15341,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15359,12 +15359,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15379,13 +15379,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15398,11 +15398,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15417,12 +15417,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15433,10 +15433,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15447,10 +15447,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15461,13 +15461,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15477,10 +15477,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15491,10 +15491,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -15507,12 +15507,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -15524,11 +15524,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -15540,11 +15540,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15555,11 +15555,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15573,13 +15573,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15590,11 +15590,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15608,13 +15608,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15625,11 +15625,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15643,13 +15643,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15661,12 +15661,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15679,12 +15679,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15698,13 +15698,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15716,12 +15716,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15734,12 +15734,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15753,13 +15753,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15771,11 +15771,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15790,13 +15790,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15809,12 +15809,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15827,12 +15827,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -15847,13 +15847,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15866,11 +15866,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15885,12 +15885,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15901,10 +15901,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15915,10 +15915,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -15929,13 +15929,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15945,10 +15945,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15959,10 +15959,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15975,12 +15975,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -15992,11 +15992,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16008,11 +16008,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16023,11 +16023,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16041,13 +16041,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16058,11 +16058,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16076,13 +16076,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -16093,11 +16093,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16111,13 +16111,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16129,12 +16129,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16147,12 +16147,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16166,13 +16166,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16184,12 +16184,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16202,12 +16202,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16221,13 +16221,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16239,11 +16239,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16258,13 +16258,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16277,12 +16277,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16295,12 +16295,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16315,13 +16315,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16334,11 +16334,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16353,12 +16353,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16369,10 +16369,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16383,10 +16383,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16397,13 +16397,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16413,10 +16413,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16427,10 +16427,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16443,12 +16443,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -16460,11 +16460,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16476,11 +16476,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16491,11 +16491,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16509,13 +16509,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16526,11 +16526,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16544,13 +16544,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -16561,11 +16561,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16579,13 +16579,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16597,12 +16597,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16615,12 +16615,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16634,13 +16634,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16652,12 +16652,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16670,12 +16670,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16689,13 +16689,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16707,11 +16707,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16726,13 +16726,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16745,12 +16745,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16763,12 +16763,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16783,13 +16783,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -16802,11 +16802,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -16821,12 +16821,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16837,10 +16837,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16851,10 +16851,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -16865,13 +16865,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16881,10 +16881,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16895,10 +16895,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16911,12 +16911,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -16928,11 +16928,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16944,11 +16944,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16959,11 +16959,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -16977,13 +16977,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -16994,11 +16994,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17012,13 +17012,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17029,11 +17029,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17047,13 +17047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17065,12 +17065,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17083,12 +17083,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17102,13 +17102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17120,12 +17120,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17138,12 +17138,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17157,13 +17157,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17175,11 +17175,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17194,13 +17194,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17213,12 +17213,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17231,12 +17231,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17251,13 +17251,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17270,11 +17270,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17289,12 +17289,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17305,10 +17305,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17319,10 +17319,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17333,13 +17333,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17349,10 +17349,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17363,10 +17363,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17379,12 +17379,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -17396,11 +17396,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -17412,11 +17412,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17427,11 +17427,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17445,13 +17445,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17462,11 +17462,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17480,13 +17480,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17497,11 +17497,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17515,13 +17515,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17533,12 +17533,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17551,12 +17551,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17570,13 +17570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17588,12 +17588,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17606,12 +17606,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17625,13 +17625,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17643,11 +17643,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17662,13 +17662,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17681,12 +17681,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17699,12 +17699,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -17719,13 +17719,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -17738,11 +17738,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -17757,12 +17757,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17773,10 +17773,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17787,10 +17787,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -17801,13 +17801,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17817,10 +17817,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17831,10 +17831,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -17847,12 +17847,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -17864,11 +17864,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -17880,11 +17880,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17895,11 +17895,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17913,13 +17913,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -17930,11 +17930,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17948,13 +17948,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -17965,11 +17965,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17983,13 +17983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18001,12 +18001,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18019,12 +18019,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18038,13 +18038,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18056,12 +18056,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18074,12 +18074,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18093,13 +18093,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -18111,11 +18111,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18130,13 +18130,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18149,12 +18149,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18167,12 +18167,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18187,13 +18187,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -18206,11 +18206,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18225,12 +18225,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18241,10 +18241,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18255,10 +18255,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18269,13 +18269,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18285,10 +18285,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18299,10 +18299,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -18315,12 +18315,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -18332,11 +18332,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -18348,11 +18348,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18363,11 +18363,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18381,13 +18381,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18398,11 +18398,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18416,13 +18416,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18433,11 +18433,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18451,13 +18451,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18469,12 +18469,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18487,12 +18487,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18506,13 +18506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18524,12 +18524,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18542,12 +18542,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18561,13 +18561,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -18579,11 +18579,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18598,13 +18598,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18617,12 +18617,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18635,12 +18635,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18655,13 +18655,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -18674,11 +18674,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18693,12 +18693,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18709,10 +18709,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18723,10 +18723,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -18737,13 +18737,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18753,10 +18753,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18767,10 +18767,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -18783,12 +18783,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -18800,11 +18800,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -18816,11 +18816,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18831,11 +18831,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18849,13 +18849,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18866,11 +18866,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18884,13 +18884,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18901,11 +18901,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18919,13 +18919,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18937,12 +18937,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18955,12 +18955,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18974,13 +18974,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18992,12 +18992,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19010,12 +19010,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19029,13 +19029,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19047,11 +19047,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19066,13 +19066,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19085,12 +19085,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19103,12 +19103,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19123,13 +19123,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19142,11 +19142,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19161,12 +19161,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19177,10 +19177,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19191,10 +19191,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19205,13 +19205,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19221,10 +19221,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19235,10 +19235,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -19251,12 +19251,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -19268,11 +19268,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -19284,11 +19284,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19299,11 +19299,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19317,13 +19317,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19334,11 +19334,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19352,13 +19352,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19369,11 +19369,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19387,13 +19387,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19405,12 +19405,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19423,12 +19423,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19442,13 +19442,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19460,12 +19460,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19478,12 +19478,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19497,13 +19497,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19515,11 +19515,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19534,13 +19534,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19553,12 +19553,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19571,12 +19571,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -19591,13 +19591,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19610,11 +19610,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19629,12 +19629,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19645,10 +19645,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19659,10 +19659,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -19673,13 +19673,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19689,10 +19689,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19703,10 +19703,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -19719,12 +19719,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -19736,11 +19736,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -19752,11 +19752,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19767,11 +19767,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19785,13 +19785,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19802,11 +19802,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19820,13 +19820,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19837,11 +19837,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19855,13 +19855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19873,12 +19873,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19891,12 +19891,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19910,13 +19910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19928,12 +19928,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19946,12 +19946,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19965,13 +19965,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19983,11 +19983,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20002,13 +20002,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20021,12 +20021,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20039,12 +20039,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20059,13 +20059,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20078,11 +20078,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20097,12 +20097,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20113,10 +20113,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20127,10 +20127,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20141,13 +20141,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20157,10 +20157,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20171,10 +20171,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -20187,12 +20187,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -20204,11 +20204,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -20220,11 +20220,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20235,11 +20235,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20253,13 +20253,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20270,11 +20270,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20288,13 +20288,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20305,11 +20305,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20323,13 +20323,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20341,12 +20341,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20359,12 +20359,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20378,13 +20378,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20396,12 +20396,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20414,12 +20414,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20433,13 +20433,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20451,11 +20451,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20470,13 +20470,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20489,12 +20489,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20507,12 +20507,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20527,13 +20527,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20546,11 +20546,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20565,12 +20565,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20581,10 +20581,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20595,10 +20595,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -20609,13 +20609,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20625,10 +20625,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20639,10 +20639,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -20655,12 +20655,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -20672,11 +20672,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -20688,11 +20688,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20703,11 +20703,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20721,13 +20721,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20738,11 +20738,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20756,13 +20756,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20773,11 +20773,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20791,13 +20791,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20809,12 +20809,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20827,12 +20827,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20846,13 +20846,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20864,12 +20864,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20882,12 +20882,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20901,13 +20901,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20919,11 +20919,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20938,13 +20938,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20957,12 +20957,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20975,12 +20975,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20995,13 +20995,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21014,11 +21014,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21033,12 +21033,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21049,10 +21049,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21063,10 +21063,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21077,13 +21077,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21093,10 +21093,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21107,10 +21107,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -21123,12 +21123,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -21140,11 +21140,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -21156,11 +21156,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21171,11 +21171,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21189,13 +21189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21206,11 +21206,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21224,13 +21224,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21241,11 +21241,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21259,13 +21259,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21277,12 +21277,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21295,12 +21295,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21314,13 +21314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21332,12 +21332,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21350,12 +21350,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21369,13 +21369,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21387,11 +21387,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21406,13 +21406,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21425,12 +21425,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21443,12 +21443,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21463,13 +21463,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21482,11 +21482,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21501,12 +21501,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21517,10 +21517,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21531,10 +21531,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X8C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21545,13 +21545,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21561,10 +21561,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21575,10 +21575,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21591,12 +21591,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -21608,11 +21608,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21624,11 +21624,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21639,11 +21639,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21657,13 +21657,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21674,11 +21674,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21692,13 +21692,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21709,11 +21709,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21727,13 +21727,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21745,12 +21745,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21763,12 +21763,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21782,13 +21782,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21800,12 +21800,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21818,12 +21818,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21837,13 +21837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21855,11 +21855,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21874,13 +21874,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21893,12 +21893,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21911,12 +21911,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21931,13 +21931,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21950,11 +21950,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21969,12 +21969,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21985,10 +21985,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21999,10 +21999,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22013,13 +22013,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22029,10 +22029,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22043,10 +22043,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -22059,12 +22059,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -22076,11 +22076,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -22092,11 +22092,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22107,11 +22107,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22125,13 +22125,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22142,11 +22142,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22160,13 +22160,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22177,11 +22177,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22195,13 +22195,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22213,12 +22213,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22231,12 +22231,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22250,13 +22250,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22268,12 +22268,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22286,12 +22286,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22305,13 +22305,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -22323,11 +22323,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22342,13 +22342,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22361,12 +22361,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22379,12 +22379,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22399,13 +22399,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -22418,11 +22418,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22437,12 +22437,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22453,10 +22453,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22467,10 +22467,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22481,13 +22481,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22497,10 +22497,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22511,10 +22511,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -22527,12 +22527,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -22544,11 +22544,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -22560,11 +22560,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22575,11 +22575,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22593,13 +22593,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22610,11 +22610,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22628,13 +22628,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22645,11 +22645,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22663,13 +22663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22681,12 +22681,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22699,12 +22699,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22718,13 +22718,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22736,12 +22736,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22754,12 +22754,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22773,13 +22773,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -22791,11 +22791,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22810,13 +22810,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22829,12 +22829,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22847,12 +22847,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22867,13 +22867,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -22886,11 +22886,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22905,12 +22905,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22921,10 +22921,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22935,10 +22935,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_3X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22949,13 +22949,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22965,10 +22965,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22979,10 +22979,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -22995,12 +22995,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -23012,11 +23012,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -23028,11 +23028,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23043,11 +23043,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23061,13 +23061,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -23078,11 +23078,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23096,13 +23096,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -23113,11 +23113,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23131,13 +23131,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23149,12 +23149,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23167,12 +23167,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23186,13 +23186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23204,12 +23204,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23222,12 +23222,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23241,13 +23241,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -23259,11 +23259,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23278,13 +23278,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23297,12 +23297,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23315,12 +23315,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23335,13 +23335,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -23354,11 +23354,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -23373,12 +23373,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23389,10 +23389,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23403,10 +23403,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C16__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23417,7 +23417,7 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qs8-igemm-minmax-gemmlowp.yaml b/test/qs8-igemm-minmax-gemmlowp.yaml
index 3ab478c..9198de2 100644
--- a/test/qs8-igemm-minmax-gemmlowp.yaml
+++ b/test/qs8-igemm-minmax-gemmlowp.yaml
@@ -3,37 +3,37 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -81,76 +81,76 @@
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mull_addw_dup
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index c9e3a9f..7ec8ba9 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,10 +47,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -63,12 +63,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -80,11 +80,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -96,11 +96,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -111,11 +111,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -129,13 +129,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -146,11 +146,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -164,13 +164,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -181,11 +181,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -199,13 +199,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -217,12 +217,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -235,12 +235,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -254,13 +254,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -272,12 +272,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -290,12 +290,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -309,13 +309,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -327,11 +327,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -346,13 +346,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -365,12 +365,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -383,12 +383,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -403,13 +403,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -422,11 +422,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -441,12 +441,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -457,10 +457,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -471,10 +471,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -485,13 +485,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -501,10 +501,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -515,10 +515,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -531,12 +531,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -548,11 +548,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -564,11 +564,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -579,11 +579,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -597,13 +597,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -614,11 +614,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -632,13 +632,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -649,11 +649,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -667,13 +667,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -685,12 +685,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -703,12 +703,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -722,13 +722,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -740,12 +740,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -758,12 +758,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -777,13 +777,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -795,11 +795,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -814,13 +814,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -833,12 +833,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -851,12 +851,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -871,13 +871,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -890,11 +890,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -909,12 +909,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -925,10 +925,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -939,10 +939,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -953,13 +953,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -969,10 +969,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -983,10 +983,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -999,12 +999,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -1016,11 +1016,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1032,11 +1032,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1047,11 +1047,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1065,13 +1065,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1082,11 +1082,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1100,13 +1100,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1117,11 +1117,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1135,13 +1135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1153,12 +1153,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1171,12 +1171,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1190,13 +1190,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1208,12 +1208,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1226,12 +1226,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1245,13 +1245,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -1263,11 +1263,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1282,13 +1282,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1301,12 +1301,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1319,12 +1319,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1339,13 +1339,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -1358,11 +1358,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1377,12 +1377,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -1393,10 +1393,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -1407,10 +1407,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -1421,13 +1421,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1437,10 +1437,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1451,10 +1451,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1467,12 +1467,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -1484,11 +1484,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1500,11 +1500,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1515,11 +1515,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1533,13 +1533,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1550,11 +1550,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1568,13 +1568,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1585,11 +1585,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1603,13 +1603,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1621,12 +1621,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1639,12 +1639,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1658,13 +1658,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1676,12 +1676,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1694,12 +1694,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1713,13 +1713,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -1731,11 +1731,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1750,13 +1750,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1769,12 +1769,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1787,12 +1787,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1807,13 +1807,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -1826,11 +1826,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1845,12 +1845,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1861,10 +1861,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1875,10 +1875,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -1889,13 +1889,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1905,10 +1905,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -1919,10 +1919,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -1935,12 +1935,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1952,11 +1952,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -1968,11 +1968,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1983,11 +1983,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2001,13 +2001,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2018,11 +2018,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2036,13 +2036,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2053,11 +2053,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2071,13 +2071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2089,12 +2089,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2107,12 +2107,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2126,13 +2126,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2144,12 +2144,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2162,12 +2162,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2181,13 +2181,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -2199,11 +2199,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2218,13 +2218,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2237,12 +2237,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2255,12 +2255,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2275,13 +2275,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -2294,11 +2294,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2313,12 +2313,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2329,10 +2329,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2343,10 +2343,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -2357,13 +2357,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2373,10 +2373,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2387,10 +2387,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -2403,12 +2403,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2420,11 +2420,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -2436,11 +2436,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2451,11 +2451,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2469,13 +2469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2486,11 +2486,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2504,13 +2504,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2521,11 +2521,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2539,13 +2539,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2557,12 +2557,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2575,12 +2575,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2594,13 +2594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2612,12 +2612,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2630,12 +2630,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2649,13 +2649,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -2667,11 +2667,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2686,13 +2686,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2705,12 +2705,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2723,12 +2723,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2743,13 +2743,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -2762,11 +2762,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2781,12 +2781,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2797,10 +2797,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2811,10 +2811,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -2825,13 +2825,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -2841,10 +2841,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -2855,10 +2855,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -2871,12 +2871,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -2888,11 +2888,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -2904,11 +2904,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2919,11 +2919,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2937,13 +2937,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2954,11 +2954,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2972,13 +2972,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2989,11 +2989,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -3007,13 +3007,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3025,12 +3025,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3043,12 +3043,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3062,13 +3062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3080,12 +3080,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3098,12 +3098,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3117,13 +3117,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -3135,11 +3135,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -3154,13 +3154,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3173,12 +3173,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3191,12 +3191,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -3211,13 +3211,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -3230,11 +3230,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3249,12 +3249,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -3265,10 +3265,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -3279,10 +3279,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -3293,13 +3293,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3309,10 +3309,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3323,10 +3323,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -3339,12 +3339,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -3356,11 +3356,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -3372,11 +3372,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -3387,11 +3387,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3405,13 +3405,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3422,11 +3422,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3440,13 +3440,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -3457,11 +3457,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3475,13 +3475,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3493,12 +3493,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3511,12 +3511,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3530,13 +3530,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3548,12 +3548,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3566,12 +3566,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3585,13 +3585,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -3603,11 +3603,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3622,13 +3622,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3641,12 +3641,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3659,12 +3659,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3679,13 +3679,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -3698,11 +3698,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3717,12 +3717,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3733,10 +3733,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3747,10 +3747,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -3761,13 +3761,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3777,10 +3777,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3791,10 +3791,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -3807,12 +3807,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3824,11 +3824,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -3840,11 +3840,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3855,11 +3855,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3873,13 +3873,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -3890,11 +3890,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3908,13 +3908,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -3925,11 +3925,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3943,13 +3943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3961,12 +3961,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3979,12 +3979,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -3998,13 +3998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4016,12 +4016,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4034,12 +4034,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4053,13 +4053,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4071,11 +4071,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4090,13 +4090,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4109,12 +4109,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4127,12 +4127,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4147,13 +4147,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4166,11 +4166,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4185,12 +4185,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4201,10 +4201,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4215,10 +4215,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -4229,13 +4229,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4245,10 +4245,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4259,10 +4259,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4275,12 +4275,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4292,11 +4292,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4308,11 +4308,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4323,11 +4323,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4341,13 +4341,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4358,11 +4358,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4376,13 +4376,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4393,11 +4393,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4411,13 +4411,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4429,12 +4429,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4447,12 +4447,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4466,13 +4466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4484,12 +4484,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4502,12 +4502,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4521,13 +4521,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4539,11 +4539,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4558,13 +4558,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4577,12 +4577,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4595,12 +4595,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4615,13 +4615,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -4634,11 +4634,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4653,12 +4653,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4669,10 +4669,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4683,10 +4683,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -4697,13 +4697,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -4713,10 +4713,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -4727,10 +4727,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -4743,12 +4743,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -4760,11 +4760,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -4776,11 +4776,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4791,11 +4791,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4809,13 +4809,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -4826,11 +4826,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4844,13 +4844,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -4861,11 +4861,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4879,13 +4879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4897,12 +4897,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4915,12 +4915,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4934,13 +4934,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4952,12 +4952,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4970,12 +4970,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -4989,13 +4989,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5007,11 +5007,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -5026,13 +5026,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5045,12 +5045,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5063,12 +5063,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -5083,13 +5083,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5102,11 +5102,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5121,12 +5121,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -5137,10 +5137,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -5151,10 +5151,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -5165,13 +5165,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5181,10 +5181,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5195,10 +5195,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5211,12 +5211,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -5228,11 +5228,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5244,11 +5244,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5259,11 +5259,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5277,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5294,11 +5294,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5312,13 +5312,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5329,11 +5329,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5347,13 +5347,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5365,12 +5365,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5383,12 +5383,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5402,13 +5402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5420,12 +5420,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5438,12 +5438,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5457,13 +5457,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5475,11 +5475,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5494,13 +5494,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5513,12 +5513,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5531,12 +5531,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5551,13 +5551,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5570,11 +5570,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5589,12 +5589,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5605,10 +5605,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5619,10 +5619,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -5633,13 +5633,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5649,10 +5649,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5663,10 +5663,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -5679,12 +5679,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5696,11 +5696,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -5712,11 +5712,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5727,11 +5727,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5745,13 +5745,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -5762,11 +5762,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5780,13 +5780,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -5797,11 +5797,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5815,13 +5815,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5833,12 +5833,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5851,12 +5851,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5870,13 +5870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5888,12 +5888,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5906,12 +5906,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5925,13 +5925,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -5943,11 +5943,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5962,13 +5962,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5981,12 +5981,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -5999,12 +5999,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6019,13 +6019,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6038,11 +6038,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6057,12 +6057,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6073,10 +6073,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6087,10 +6087,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6101,13 +6101,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6117,10 +6117,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6131,10 +6131,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -6147,12 +6147,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6164,11 +6164,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -6180,11 +6180,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6195,11 +6195,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6213,13 +6213,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6230,11 +6230,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6248,13 +6248,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6265,11 +6265,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6283,13 +6283,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6301,12 +6301,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6319,12 +6319,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6338,13 +6338,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6356,12 +6356,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6374,12 +6374,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6393,13 +6393,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6411,11 +6411,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6430,13 +6430,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6449,12 +6449,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6467,12 +6467,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6487,13 +6487,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6506,11 +6506,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6525,12 +6525,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6541,10 +6541,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6555,10 +6555,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6569,13 +6569,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6585,10 +6585,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -6599,10 +6599,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -6615,12 +6615,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -6632,11 +6632,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -6648,11 +6648,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6663,11 +6663,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6681,13 +6681,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -6698,11 +6698,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6716,13 +6716,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -6733,11 +6733,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6751,13 +6751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6769,12 +6769,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6787,12 +6787,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6806,13 +6806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6824,12 +6824,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6842,12 +6842,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6861,13 +6861,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6879,11 +6879,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6898,13 +6898,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6917,12 +6917,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6935,12 +6935,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6955,13 +6955,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -6974,11 +6974,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -6993,12 +6993,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -7009,10 +7009,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -7023,10 +7023,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -7037,13 +7037,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7053,10 +7053,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7067,10 +7067,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -7083,12 +7083,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -7100,11 +7100,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -7116,11 +7116,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7131,11 +7131,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7149,13 +7149,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -7166,11 +7166,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7184,13 +7184,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -7201,11 +7201,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7219,13 +7219,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7237,12 +7237,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7255,12 +7255,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7274,13 +7274,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7292,12 +7292,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7310,12 +7310,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7329,13 +7329,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7347,11 +7347,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7366,13 +7366,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7385,12 +7385,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7403,12 +7403,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7423,13 +7423,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -7442,11 +7442,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -7461,12 +7461,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7477,10 +7477,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7491,10 +7491,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7505,13 +7505,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7521,10 +7521,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7535,10 +7535,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7551,12 +7551,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -7568,11 +7568,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7584,11 +7584,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7599,11 +7599,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7617,13 +7617,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7634,11 +7634,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7652,13 +7652,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7669,11 +7669,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7687,13 +7687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7705,12 +7705,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7723,12 +7723,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7742,13 +7742,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7760,12 +7760,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7778,12 +7778,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7797,13 +7797,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -7815,11 +7815,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7834,13 +7834,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7853,12 +7853,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7871,12 +7871,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7891,13 +7891,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -7910,11 +7910,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7929,12 +7929,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7945,10 +7945,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7959,10 +7959,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -7973,13 +7973,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -7989,10 +7989,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8003,10 +8003,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8019,12 +8019,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -8036,11 +8036,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8052,11 +8052,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8067,11 +8067,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8085,13 +8085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8102,11 +8102,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8120,13 +8120,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8137,11 +8137,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8155,13 +8155,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8173,12 +8173,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8191,12 +8191,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8210,13 +8210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8228,12 +8228,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8246,12 +8246,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8265,13 +8265,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -8283,11 +8283,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8302,13 +8302,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8321,12 +8321,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8339,12 +8339,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8359,13 +8359,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -8378,11 +8378,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8397,12 +8397,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8413,10 +8413,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8427,10 +8427,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -8441,13 +8441,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8457,10 +8457,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8471,10 +8471,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8487,12 +8487,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -8504,11 +8504,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8520,11 +8520,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8535,11 +8535,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8553,13 +8553,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8570,11 +8570,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8588,13 +8588,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8605,11 +8605,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8623,13 +8623,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8641,12 +8641,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8659,12 +8659,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8678,13 +8678,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8696,12 +8696,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8714,12 +8714,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8733,13 +8733,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -8751,11 +8751,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8770,13 +8770,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8789,12 +8789,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8807,12 +8807,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8827,13 +8827,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -8846,11 +8846,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8865,12 +8865,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8881,10 +8881,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8895,10 +8895,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -8909,13 +8909,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8925,10 +8925,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8939,10 +8939,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8955,12 +8955,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -8972,11 +8972,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8988,11 +8988,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9003,11 +9003,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9021,13 +9021,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9038,11 +9038,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9056,13 +9056,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9073,11 +9073,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9091,13 +9091,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9109,12 +9109,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9127,12 +9127,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9146,13 +9146,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9164,12 +9164,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9182,12 +9182,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9201,13 +9201,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -9219,11 +9219,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9238,13 +9238,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9257,12 +9257,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9275,12 +9275,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9295,13 +9295,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -9314,11 +9314,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9333,12 +9333,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9349,10 +9349,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9363,10 +9363,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9377,13 +9377,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9393,10 +9393,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9407,10 +9407,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -9423,12 +9423,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -9440,11 +9440,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -9456,11 +9456,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9471,11 +9471,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9489,13 +9489,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9506,11 +9506,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9524,13 +9524,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9541,11 +9541,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9559,13 +9559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9577,12 +9577,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9595,12 +9595,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9614,13 +9614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9632,12 +9632,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9650,12 +9650,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9669,13 +9669,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -9687,11 +9687,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9706,13 +9706,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9725,12 +9725,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9743,12 +9743,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9763,13 +9763,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -9782,11 +9782,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9801,12 +9801,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9817,10 +9817,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9831,10 +9831,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -9845,13 +9845,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9861,10 +9861,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -9875,10 +9875,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -9891,12 +9891,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9908,11 +9908,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -9924,11 +9924,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9939,11 +9939,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9957,13 +9957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9974,11 +9974,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9992,13 +9992,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10009,11 +10009,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10027,13 +10027,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10045,12 +10045,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10063,12 +10063,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10082,13 +10082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10100,12 +10100,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10118,12 +10118,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10137,13 +10137,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -10155,11 +10155,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10174,13 +10174,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10193,12 +10193,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10211,12 +10211,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10231,13 +10231,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -10250,11 +10250,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10269,12 +10269,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10285,10 +10285,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10299,10 +10299,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -10313,13 +10313,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10329,10 +10329,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10343,10 +10343,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -10359,12 +10359,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -10376,11 +10376,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -10392,11 +10392,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10407,11 +10407,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10425,13 +10425,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10442,11 +10442,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10460,13 +10460,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10477,11 +10477,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10495,13 +10495,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10513,12 +10513,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10531,12 +10531,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10550,13 +10550,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10568,12 +10568,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10586,12 +10586,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10605,13 +10605,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -10623,11 +10623,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10642,13 +10642,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10661,12 +10661,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10679,12 +10679,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10699,13 +10699,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -10718,11 +10718,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10737,12 +10737,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10753,10 +10753,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10767,10 +10767,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -10781,13 +10781,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10797,10 +10797,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10811,10 +10811,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -10827,12 +10827,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -10844,11 +10844,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -10860,11 +10860,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10875,11 +10875,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10893,13 +10893,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10910,11 +10910,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10928,13 +10928,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10945,11 +10945,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10963,13 +10963,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10981,12 +10981,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10999,12 +10999,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11018,13 +11018,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11036,12 +11036,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11054,12 +11054,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11073,13 +11073,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -11091,11 +11091,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11110,13 +11110,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11129,12 +11129,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11147,12 +11147,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11167,13 +11167,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -11186,11 +11186,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11205,12 +11205,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -11221,10 +11221,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -11235,10 +11235,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -11249,13 +11249,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11265,10 +11265,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11279,10 +11279,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11295,12 +11295,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11312,11 +11312,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11328,11 +11328,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11343,11 +11343,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11361,13 +11361,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11378,11 +11378,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11396,13 +11396,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11413,11 +11413,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11431,13 +11431,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11449,12 +11449,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11467,12 +11467,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11486,13 +11486,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11504,12 +11504,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11522,12 +11522,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11541,13 +11541,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11559,11 +11559,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11578,13 +11578,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11597,12 +11597,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11615,12 +11615,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11635,13 +11635,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -11654,11 +11654,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11673,12 +11673,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11689,10 +11689,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11703,10 +11703,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -11717,13 +11717,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11733,10 +11733,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -11747,10 +11747,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11763,12 +11763,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -11780,11 +11780,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11796,11 +11796,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11811,11 +11811,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11829,13 +11829,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -11846,11 +11846,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11864,13 +11864,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -11881,11 +11881,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11899,13 +11899,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11917,12 +11917,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11935,12 +11935,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11954,13 +11954,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11972,12 +11972,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -11990,12 +11990,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12009,13 +12009,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -12027,11 +12027,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -12046,13 +12046,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12065,12 +12065,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12083,12 +12083,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -12103,13 +12103,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -12122,11 +12122,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12141,12 +12141,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -12157,10 +12157,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -12171,10 +12171,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -12185,13 +12185,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12201,10 +12201,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12215,10 +12215,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12231,12 +12231,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -12248,11 +12248,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12264,11 +12264,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12279,11 +12279,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12297,13 +12297,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12314,11 +12314,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12332,13 +12332,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12349,11 +12349,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12367,13 +12367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12385,12 +12385,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12403,12 +12403,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12422,13 +12422,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12440,12 +12440,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12458,12 +12458,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12477,13 +12477,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -12495,11 +12495,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12514,13 +12514,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12533,12 +12533,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12551,12 +12551,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12571,13 +12571,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -12590,11 +12590,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12609,12 +12609,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12625,10 +12625,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12639,10 +12639,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12653,13 +12653,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12669,10 +12669,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -12683,10 +12683,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12699,12 +12699,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -12716,11 +12716,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12732,11 +12732,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12747,11 +12747,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12765,13 +12765,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -12782,11 +12782,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12800,13 +12800,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -12817,11 +12817,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12835,13 +12835,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12853,12 +12853,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12871,12 +12871,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12890,13 +12890,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12908,12 +12908,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12926,12 +12926,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -12945,13 +12945,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -12963,11 +12963,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12982,13 +12982,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13001,12 +13001,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13019,12 +13019,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13039,13 +13039,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -13058,11 +13058,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13077,12 +13077,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13093,10 +13093,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13107,10 +13107,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13121,13 +13121,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13137,10 +13137,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13151,10 +13151,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -13167,12 +13167,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -13184,11 +13184,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -13200,11 +13200,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13215,11 +13215,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13233,13 +13233,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -13250,11 +13250,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13268,13 +13268,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13285,11 +13285,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13303,13 +13303,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13321,12 +13321,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13339,12 +13339,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13358,13 +13358,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13376,12 +13376,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13394,12 +13394,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13413,13 +13413,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -13431,11 +13431,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13450,13 +13450,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13469,12 +13469,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13487,12 +13487,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13507,13 +13507,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -13526,11 +13526,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13545,12 +13545,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13561,10 +13561,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13575,10 +13575,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -13589,13 +13589,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13605,10 +13605,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13619,10 +13619,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -13635,12 +13635,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -13652,11 +13652,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -13668,11 +13668,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13683,11 +13683,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13701,13 +13701,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -13718,11 +13718,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13736,13 +13736,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -13753,11 +13753,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13771,13 +13771,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13789,12 +13789,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13807,12 +13807,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13826,13 +13826,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13844,12 +13844,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13862,12 +13862,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13881,13 +13881,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -13899,11 +13899,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13918,13 +13918,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13937,12 +13937,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -13955,12 +13955,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13975,13 +13975,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -13994,11 +13994,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14013,12 +14013,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14029,10 +14029,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14043,10 +14043,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14057,13 +14057,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14073,10 +14073,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14087,10 +14087,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -14103,12 +14103,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -14120,11 +14120,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -14136,11 +14136,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14151,11 +14151,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14169,13 +14169,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -14186,11 +14186,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14204,13 +14204,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -14221,11 +14221,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14239,13 +14239,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14257,12 +14257,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14275,12 +14275,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14294,13 +14294,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14312,12 +14312,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14330,12 +14330,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14349,13 +14349,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -14367,11 +14367,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14386,13 +14386,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14405,12 +14405,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14423,12 +14423,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14443,13 +14443,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -14462,11 +14462,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14481,12 +14481,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14497,10 +14497,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14511,10 +14511,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -14525,13 +14525,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14541,10 +14541,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14555,10 +14555,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -14571,12 +14571,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -14588,11 +14588,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -14604,11 +14604,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14619,11 +14619,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14637,13 +14637,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -14654,11 +14654,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14672,13 +14672,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -14689,11 +14689,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14707,13 +14707,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14725,12 +14725,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14743,12 +14743,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14762,13 +14762,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14780,12 +14780,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14798,12 +14798,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14817,13 +14817,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -14835,11 +14835,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14854,13 +14854,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14873,12 +14873,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14891,12 +14891,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14911,13 +14911,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -14930,11 +14930,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -14949,12 +14949,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14965,10 +14965,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14979,10 +14979,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -14993,13 +14993,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15009,10 +15009,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15023,10 +15023,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15039,12 +15039,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -15056,11 +15056,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -15072,11 +15072,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15087,11 +15087,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15105,13 +15105,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15122,11 +15122,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15140,13 +15140,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15157,11 +15157,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15175,13 +15175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15193,12 +15193,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15211,12 +15211,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15230,13 +15230,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15248,12 +15248,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15266,12 +15266,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15285,13 +15285,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15303,11 +15303,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15322,13 +15322,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15341,12 +15341,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15359,12 +15359,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15379,13 +15379,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15398,11 +15398,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15417,12 +15417,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15433,10 +15433,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15447,10 +15447,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -15461,13 +15461,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15477,10 +15477,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15491,10 +15491,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15507,12 +15507,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -15524,11 +15524,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -15540,11 +15540,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15555,11 +15555,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15573,13 +15573,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15590,11 +15590,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15608,13 +15608,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15625,11 +15625,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15643,13 +15643,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15661,12 +15661,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15679,12 +15679,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15698,13 +15698,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15716,12 +15716,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15734,12 +15734,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15753,13 +15753,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15771,11 +15771,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15790,13 +15790,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15809,12 +15809,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15827,12 +15827,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15847,13 +15847,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -15866,11 +15866,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15885,12 +15885,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15901,10 +15901,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15915,10 +15915,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -15929,13 +15929,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15945,10 +15945,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -15959,10 +15959,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -15975,12 +15975,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -15992,11 +15992,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16008,11 +16008,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16023,11 +16023,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -16041,13 +16041,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16058,11 +16058,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -16076,13 +16076,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16093,11 +16093,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -16111,13 +16111,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16129,12 +16129,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16147,12 +16147,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16166,13 +16166,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16184,12 +16184,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16202,12 +16202,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16221,13 +16221,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -16239,11 +16239,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -16258,13 +16258,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16277,12 +16277,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16295,12 +16295,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -16315,13 +16315,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -16334,11 +16334,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16353,12 +16353,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16369,10 +16369,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16383,10 +16383,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -16397,13 +16397,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16413,10 +16413,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16427,10 +16427,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16443,12 +16443,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -16460,11 +16460,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16476,11 +16476,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16491,11 +16491,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16509,13 +16509,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16526,11 +16526,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16544,13 +16544,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16561,11 +16561,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16579,13 +16579,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16597,12 +16597,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16615,12 +16615,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16634,13 +16634,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16652,12 +16652,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16670,12 +16670,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16689,13 +16689,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -16707,11 +16707,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16726,13 +16726,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16745,12 +16745,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16763,12 +16763,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -16783,13 +16783,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -16802,11 +16802,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16821,12 +16821,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16837,10 +16837,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16851,10 +16851,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -16865,13 +16865,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16881,10 +16881,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16895,10 +16895,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -16911,12 +16911,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16928,11 +16928,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -16944,11 +16944,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16959,11 +16959,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16977,13 +16977,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16994,11 +16994,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17012,13 +17012,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17029,11 +17029,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17047,13 +17047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17065,12 +17065,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17083,12 +17083,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17102,13 +17102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17120,12 +17120,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17138,12 +17138,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17157,13 +17157,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -17175,11 +17175,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17194,13 +17194,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17213,12 +17213,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17231,12 +17231,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17251,13 +17251,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -17270,11 +17270,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17289,12 +17289,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17305,10 +17305,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17319,10 +17319,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17333,13 +17333,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17349,10 +17349,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17363,10 +17363,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -17379,12 +17379,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -17396,11 +17396,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -17412,11 +17412,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17427,11 +17427,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17445,13 +17445,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17462,11 +17462,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17480,13 +17480,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17497,11 +17497,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17515,13 +17515,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17533,12 +17533,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17551,12 +17551,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17570,13 +17570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17588,12 +17588,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17606,12 +17606,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17625,13 +17625,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -17643,11 +17643,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17662,13 +17662,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17681,12 +17681,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17699,12 +17699,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17719,13 +17719,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -17738,11 +17738,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17757,12 +17757,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17773,10 +17773,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17787,10 +17787,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -17801,13 +17801,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17817,10 +17817,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -17831,10 +17831,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -17847,12 +17847,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -17864,11 +17864,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -17880,11 +17880,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17895,11 +17895,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17913,13 +17913,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17930,11 +17930,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17948,13 +17948,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17965,11 +17965,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17983,13 +17983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18001,12 +18001,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18019,12 +18019,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18038,13 +18038,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18056,12 +18056,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18074,12 +18074,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18093,13 +18093,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -18111,11 +18111,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18130,13 +18130,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18149,12 +18149,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18167,12 +18167,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18187,13 +18187,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -18206,11 +18206,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18225,12 +18225,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18241,10 +18241,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18255,10 +18255,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -18269,13 +18269,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18285,10 +18285,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18299,10 +18299,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -18315,12 +18315,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -18332,11 +18332,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -18348,11 +18348,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -18363,11 +18363,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18381,13 +18381,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18398,11 +18398,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18416,13 +18416,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -18433,11 +18433,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18451,13 +18451,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18469,12 +18469,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18487,12 +18487,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18506,13 +18506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18524,12 +18524,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18542,12 +18542,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18561,13 +18561,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -18579,11 +18579,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18598,13 +18598,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18617,12 +18617,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18635,12 +18635,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -18655,13 +18655,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -18674,11 +18674,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18693,12 +18693,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18709,10 +18709,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18723,10 +18723,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -18737,13 +18737,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18753,10 +18753,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18767,10 +18767,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18783,12 +18783,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -18800,11 +18800,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18816,11 +18816,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18831,11 +18831,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18849,13 +18849,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18866,11 +18866,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18884,13 +18884,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18901,11 +18901,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18919,13 +18919,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18937,12 +18937,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18955,12 +18955,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18974,13 +18974,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18992,12 +18992,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19010,12 +19010,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19029,13 +19029,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19047,11 +19047,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19066,13 +19066,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19085,12 +19085,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19103,12 +19103,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19123,13 +19123,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19142,11 +19142,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19161,12 +19161,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19177,10 +19177,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19191,10 +19191,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -19205,13 +19205,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19221,10 +19221,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19235,10 +19235,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -19251,12 +19251,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -19268,11 +19268,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -19284,11 +19284,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19299,11 +19299,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19317,13 +19317,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19334,11 +19334,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19352,13 +19352,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19369,11 +19369,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19387,13 +19387,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19405,12 +19405,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19423,12 +19423,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19442,13 +19442,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19460,12 +19460,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19478,12 +19478,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19497,13 +19497,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19515,11 +19515,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19534,13 +19534,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19553,12 +19553,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19571,12 +19571,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19591,13 +19591,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19610,11 +19610,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19629,12 +19629,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19645,10 +19645,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19659,10 +19659,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -19673,13 +19673,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19689,10 +19689,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -19703,10 +19703,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -19719,12 +19719,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -19736,11 +19736,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -19752,11 +19752,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19767,11 +19767,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19785,13 +19785,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -19802,11 +19802,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19820,13 +19820,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -19837,11 +19837,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19855,13 +19855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19873,12 +19873,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19891,12 +19891,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19910,13 +19910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19928,12 +19928,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19946,12 +19946,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -19965,13 +19965,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -19983,11 +19983,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20002,13 +20002,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20021,12 +20021,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20039,12 +20039,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20059,13 +20059,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20078,11 +20078,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20097,12 +20097,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20113,10 +20113,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20127,10 +20127,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -20141,13 +20141,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20157,10 +20157,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20171,10 +20171,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -20187,12 +20187,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -20204,11 +20204,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -20220,11 +20220,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20235,11 +20235,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -20253,13 +20253,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20270,11 +20270,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -20288,13 +20288,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20305,11 +20305,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -20323,13 +20323,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20341,12 +20341,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20359,12 +20359,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20378,13 +20378,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20396,12 +20396,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20414,12 +20414,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20433,13 +20433,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20451,11 +20451,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -20470,13 +20470,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20489,12 +20489,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20507,12 +20507,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -20527,13 +20527,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20546,11 +20546,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20565,12 +20565,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20581,10 +20581,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20595,10 +20595,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -20609,13 +20609,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20625,10 +20625,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -20639,10 +20639,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -20655,12 +20655,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -20672,11 +20672,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -20688,11 +20688,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20703,11 +20703,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20721,13 +20721,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -20738,11 +20738,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20756,13 +20756,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -20773,11 +20773,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20791,13 +20791,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20809,12 +20809,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20827,12 +20827,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20846,13 +20846,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20864,12 +20864,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20882,12 +20882,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20901,13 +20901,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -20919,11 +20919,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20938,13 +20938,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20957,12 +20957,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -20975,12 +20975,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20995,13 +20995,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21014,11 +21014,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21033,12 +21033,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21049,10 +21049,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21063,10 +21063,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -21077,13 +21077,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21093,10 +21093,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21107,10 +21107,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21123,12 +21123,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -21140,11 +21140,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21156,11 +21156,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21171,11 +21171,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -21189,13 +21189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21206,11 +21206,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -21224,13 +21224,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21241,11 +21241,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -21259,13 +21259,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21277,12 +21277,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21295,12 +21295,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21314,13 +21314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21332,12 +21332,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21350,12 +21350,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21369,13 +21369,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21387,11 +21387,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -21406,13 +21406,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21425,12 +21425,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21443,12 +21443,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -21463,13 +21463,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21482,11 +21482,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21501,12 +21501,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21517,10 +21517,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21531,10 +21531,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -21545,13 +21545,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21561,10 +21561,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21575,10 +21575,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21591,12 +21591,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -21608,11 +21608,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21624,11 +21624,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21639,11 +21639,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21657,13 +21657,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -21674,11 +21674,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21692,13 +21692,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -21709,11 +21709,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21727,13 +21727,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21745,12 +21745,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21763,12 +21763,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21782,13 +21782,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21800,12 +21800,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21818,12 +21818,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21837,13 +21837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21855,11 +21855,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21874,13 +21874,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21893,12 +21893,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21911,12 +21911,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21931,13 +21931,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -21950,11 +21950,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -21969,12 +21969,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21985,10 +21985,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -21999,10 +21999,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -22013,13 +22013,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22029,10 +22029,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22043,10 +22043,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -22059,12 +22059,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -22076,11 +22076,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -22092,11 +22092,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22107,11 +22107,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -22125,13 +22125,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -22142,11 +22142,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -22160,13 +22160,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -22177,11 +22177,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -22195,13 +22195,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22213,12 +22213,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22231,12 +22231,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22250,13 +22250,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22268,12 +22268,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22286,12 +22286,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22305,13 +22305,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -22323,11 +22323,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -22342,13 +22342,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22361,12 +22361,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22379,12 +22379,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -22399,13 +22399,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -22418,11 +22418,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -22437,12 +22437,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22453,10 +22453,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22467,10 +22467,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22481,13 +22481,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22497,10 +22497,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22511,10 +22511,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -22527,12 +22527,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -22544,11 +22544,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -22560,11 +22560,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22575,11 +22575,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22593,13 +22593,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22610,11 +22610,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22628,13 +22628,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22645,11 +22645,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22663,13 +22663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22681,12 +22681,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22699,12 +22699,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22718,13 +22718,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22736,12 +22736,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22754,12 +22754,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22773,13 +22773,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -22791,11 +22791,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22810,13 +22810,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22829,12 +22829,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22847,12 +22847,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22867,13 +22867,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -22886,11 +22886,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22905,12 +22905,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22921,10 +22921,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22935,10 +22935,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -22949,13 +22949,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22965,10 +22965,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -22979,10 +22979,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -22995,12 +22995,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -23012,11 +23012,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -23028,11 +23028,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23043,11 +23043,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23061,13 +23061,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23078,11 +23078,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23096,13 +23096,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23113,11 +23113,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23131,13 +23131,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23149,12 +23149,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23167,12 +23167,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23186,13 +23186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23204,12 +23204,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23222,12 +23222,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23241,13 +23241,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -23259,11 +23259,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23278,13 +23278,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23297,12 +23297,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23315,12 +23315,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23335,13 +23335,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -23354,11 +23354,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23373,12 +23373,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -23389,10 +23389,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -23403,10 +23403,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -23417,13 +23417,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23433,10 +23433,10 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23447,10 +23447,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -23463,12 +23463,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -23480,11 +23480,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -23496,11 +23496,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23511,11 +23511,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23529,13 +23529,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23546,11 +23546,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23564,13 +23564,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23581,11 +23581,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23599,13 +23599,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23617,12 +23617,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23635,12 +23635,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23654,13 +23654,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23672,12 +23672,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23690,12 +23690,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23709,13 +23709,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -23727,11 +23727,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23746,13 +23746,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23765,12 +23765,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23783,12 +23783,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23803,13 +23803,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -23822,11 +23822,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23841,12 +23841,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23857,10 +23857,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23871,10 +23871,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -23885,13 +23885,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23901,10 +23901,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -23915,10 +23915,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -23931,12 +23931,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -23948,11 +23948,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -23964,11 +23964,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23979,11 +23979,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -23997,13 +23997,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24014,11 +24014,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24032,13 +24032,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24049,11 +24049,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24067,13 +24067,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24085,12 +24085,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24103,12 +24103,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24122,13 +24122,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24140,12 +24140,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24158,12 +24158,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24177,13 +24177,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -24195,11 +24195,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24214,13 +24214,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24233,12 +24233,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24251,12 +24251,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -24271,13 +24271,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -24290,11 +24290,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24309,12 +24309,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -24325,10 +24325,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -24339,10 +24339,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -24353,13 +24353,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24369,10 +24369,10 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24383,10 +24383,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -24399,12 +24399,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -24416,11 +24416,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -24432,11 +24432,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24447,11 +24447,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24465,13 +24465,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24482,11 +24482,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24500,13 +24500,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24517,11 +24517,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24535,13 +24535,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24553,12 +24553,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24571,12 +24571,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24590,13 +24590,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24608,12 +24608,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24626,12 +24626,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24645,13 +24645,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -24663,11 +24663,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24682,13 +24682,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24701,12 +24701,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24719,12 +24719,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24739,13 +24739,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -24758,11 +24758,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24777,12 +24777,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24793,10 +24793,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24807,10 +24807,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -24821,13 +24821,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24837,10 +24837,10 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -24851,10 +24851,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -24867,12 +24867,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -24884,11 +24884,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -24900,11 +24900,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24915,11 +24915,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24933,13 +24933,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24950,11 +24950,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24968,13 +24968,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24985,11 +24985,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -25003,13 +25003,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25021,12 +25021,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25039,12 +25039,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25058,13 +25058,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25076,12 +25076,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25094,12 +25094,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25113,13 +25113,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -25131,11 +25131,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -25150,13 +25150,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25169,12 +25169,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25187,12 +25187,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -25207,13 +25207,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -25226,11 +25226,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25245,12 +25245,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -25261,10 +25261,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -25275,10 +25275,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -25289,13 +25289,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25305,10 +25305,10 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25319,10 +25319,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -25335,12 +25335,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -25352,11 +25352,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -25368,11 +25368,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25383,11 +25383,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25401,13 +25401,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25418,11 +25418,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25436,13 +25436,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25453,11 +25453,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25471,13 +25471,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25489,12 +25489,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25507,12 +25507,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25526,13 +25526,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25544,12 +25544,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25562,12 +25562,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25581,13 +25581,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -25599,11 +25599,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25618,13 +25618,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25637,12 +25637,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25655,12 +25655,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25675,13 +25675,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -25694,11 +25694,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25713,12 +25713,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25729,10 +25729,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25743,10 +25743,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -25757,13 +25757,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25773,10 +25773,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25787,10 +25787,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -25803,12 +25803,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -25820,11 +25820,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -25836,11 +25836,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25851,11 +25851,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25869,13 +25869,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25886,11 +25886,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25904,13 +25904,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25921,11 +25921,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25939,13 +25939,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25957,12 +25957,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25975,12 +25975,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25994,13 +25994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26012,12 +26012,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26030,12 +26030,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26049,13 +26049,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -26067,11 +26067,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -26086,13 +26086,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26105,12 +26105,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26123,12 +26123,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -26143,13 +26143,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -26162,11 +26162,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26181,12 +26181,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26197,10 +26197,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26211,10 +26211,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26225,13 +26225,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -26241,10 +26241,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -26255,10 +26255,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -26271,12 +26271,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -26288,11 +26288,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -26304,11 +26304,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26319,11 +26319,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -26337,13 +26337,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -26354,11 +26354,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -26372,13 +26372,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -26389,11 +26389,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -26407,13 +26407,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26425,12 +26425,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26443,12 +26443,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26462,13 +26462,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26480,12 +26480,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26498,12 +26498,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26517,13 +26517,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -26535,11 +26535,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -26554,13 +26554,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26573,12 +26573,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26591,12 +26591,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -26611,13 +26611,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -26630,11 +26630,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26649,12 +26649,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -26665,10 +26665,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -26679,10 +26679,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -26693,13 +26693,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26709,10 +26709,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -26723,10 +26723,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -26739,12 +26739,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -26756,11 +26756,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -26772,11 +26772,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26787,11 +26787,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26805,13 +26805,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -26822,11 +26822,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26840,13 +26840,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -26857,11 +26857,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26875,13 +26875,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26893,12 +26893,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26911,12 +26911,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26930,13 +26930,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26948,12 +26948,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26966,12 +26966,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -26985,13 +26985,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -27003,11 +27003,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27022,13 +27022,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27041,12 +27041,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27059,12 +27059,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27079,13 +27079,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -27098,11 +27098,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27117,12 +27117,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27133,10 +27133,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27147,10 +27147,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27161,13 +27161,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -27177,10 +27177,10 @@
       .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -27191,10 +27191,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -27207,12 +27207,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -27224,11 +27224,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -27240,11 +27240,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27255,11 +27255,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -27273,13 +27273,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27290,11 +27290,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -27308,13 +27308,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -27325,11 +27325,11 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -27343,13 +27343,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27361,12 +27361,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27379,12 +27379,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27398,13 +27398,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27416,12 +27416,12 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27434,12 +27434,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27453,13 +27453,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -27471,11 +27471,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -27490,13 +27490,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27509,12 +27509,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27527,12 +27527,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -27547,13 +27547,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -27566,11 +27566,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27585,12 +27585,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -27601,10 +27601,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -27615,10 +27615,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -27629,13 +27629,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -27645,10 +27645,10 @@
       .m(4)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -27659,10 +27659,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -27675,12 +27675,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -27692,11 +27692,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -27708,11 +27708,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27723,11 +27723,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27741,13 +27741,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -27758,11 +27758,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27776,13 +27776,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -27793,11 +27793,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27811,13 +27811,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27829,12 +27829,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27847,12 +27847,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27866,13 +27866,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27884,12 +27884,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27902,12 +27902,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27921,13 +27921,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -27939,11 +27939,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -27958,13 +27958,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27977,12 +27977,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -27995,12 +27995,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -28015,13 +28015,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -28034,11 +28034,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28053,12 +28053,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -28069,10 +28069,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -28083,10 +28083,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -28097,13 +28097,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -28113,10 +28113,10 @@
       .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -28127,10 +28127,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -28143,12 +28143,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -28160,11 +28160,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -28176,11 +28176,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -28191,11 +28191,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -28209,13 +28209,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28226,11 +28226,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -28244,13 +28244,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28261,11 +28261,11 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -28279,13 +28279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28297,12 +28297,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28315,12 +28315,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28334,13 +28334,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28352,12 +28352,12 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28370,12 +28370,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28389,13 +28389,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -28407,11 +28407,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -28426,13 +28426,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28445,12 +28445,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28463,12 +28463,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -28483,13 +28483,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -28502,11 +28502,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28521,12 +28521,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -28537,10 +28537,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -28551,10 +28551,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -28565,13 +28565,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -28581,10 +28581,10 @@
       .m(2)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -28595,10 +28595,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -28611,12 +28611,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -28628,11 +28628,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -28644,11 +28644,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -28659,11 +28659,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28677,13 +28677,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -28694,11 +28694,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28712,13 +28712,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -28729,11 +28729,11 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28747,13 +28747,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28765,12 +28765,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28783,12 +28783,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28802,13 +28802,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28820,12 +28820,12 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28838,12 +28838,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28857,13 +28857,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -28875,11 +28875,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28894,13 +28894,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28913,12 +28913,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28931,12 +28931,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -28951,13 +28951,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -28970,11 +28970,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -28989,12 +28989,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29005,10 +29005,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29019,10 +29019,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29033,13 +29033,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -29049,10 +29049,10 @@
       .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -29063,10 +29063,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -29079,12 +29079,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -29096,11 +29096,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -29112,11 +29112,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -29127,11 +29127,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -29145,13 +29145,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -29162,11 +29162,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -29180,13 +29180,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -29197,11 +29197,11 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -29215,13 +29215,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29233,12 +29233,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29251,12 +29251,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29270,13 +29270,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29288,12 +29288,12 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29306,12 +29306,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29325,13 +29325,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -29343,11 +29343,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -29362,13 +29362,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29381,12 +29381,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29399,12 +29399,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -29419,13 +29419,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -29438,11 +29438,11 @@
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29457,12 +29457,12 @@
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -29473,10 +29473,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -29487,10 +29487,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -29501,13 +29501,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29517,10 +29517,10 @@
       .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29531,10 +29531,10 @@
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -29547,12 +29547,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -29564,11 +29564,11 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -29580,11 +29580,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -29595,11 +29595,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -29613,13 +29613,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -29630,11 +29630,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -29648,13 +29648,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -29665,11 +29665,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -29683,13 +29683,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29701,12 +29701,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29719,12 +29719,12 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29738,13 +29738,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29756,12 +29756,12 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29774,12 +29774,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29793,13 +29793,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -29811,11 +29811,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -29830,13 +29830,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29849,12 +29849,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29867,12 +29867,12 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -29887,13 +29887,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -29906,11 +29906,11 @@
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -29925,12 +29925,12 @@
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29941,10 +29941,10 @@
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29955,10 +29955,10 @@
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -29969,13 +29969,13 @@
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29985,10 +29985,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -29999,10 +29999,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -30015,12 +30015,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -30032,11 +30032,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -30048,11 +30048,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30063,11 +30063,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30081,13 +30081,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -30098,11 +30098,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30116,13 +30116,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -30133,11 +30133,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30151,13 +30151,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30169,12 +30169,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30187,12 +30187,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30206,13 +30206,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30224,12 +30224,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30242,12 +30242,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30261,13 +30261,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -30279,11 +30279,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30298,13 +30298,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30317,12 +30317,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30335,12 +30335,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30355,13 +30355,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -30374,11 +30374,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30393,12 +30393,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30409,10 +30409,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30423,10 +30423,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30437,13 +30437,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30453,10 +30453,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30467,10 +30467,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -30483,12 +30483,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -30500,11 +30500,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -30516,11 +30516,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30531,11 +30531,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30549,13 +30549,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -30566,11 +30566,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30584,13 +30584,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -30601,11 +30601,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30619,13 +30619,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30637,12 +30637,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30655,12 +30655,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30674,13 +30674,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30692,12 +30692,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30710,12 +30710,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30729,13 +30729,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -30747,11 +30747,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30766,13 +30766,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30785,12 +30785,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30803,12 +30803,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -30823,13 +30823,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -30842,11 +30842,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -30861,12 +30861,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30877,10 +30877,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30891,10 +30891,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30905,13 +30905,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30921,10 +30921,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -30935,10 +30935,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -30951,12 +30951,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -30968,11 +30968,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -30984,11 +30984,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -30999,11 +30999,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31017,13 +31017,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31034,11 +31034,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31052,13 +31052,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -31069,11 +31069,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31087,13 +31087,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31105,12 +31105,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31123,12 +31123,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31142,13 +31142,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31160,12 +31160,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31178,12 +31178,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31197,13 +31197,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -31215,11 +31215,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31234,13 +31234,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31253,12 +31253,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31271,12 +31271,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31291,13 +31291,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -31310,11 +31310,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31329,12 +31329,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31345,10 +31345,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31359,10 +31359,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31373,13 +31373,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31389,10 +31389,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31403,10 +31403,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -31419,12 +31419,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -31436,11 +31436,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -31452,11 +31452,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -31467,11 +31467,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31485,13 +31485,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31502,11 +31502,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31520,13 +31520,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -31537,11 +31537,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31555,13 +31555,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31573,12 +31573,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31591,12 +31591,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31610,13 +31610,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31628,12 +31628,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31646,12 +31646,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31665,13 +31665,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -31683,11 +31683,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31702,13 +31702,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31721,12 +31721,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31739,12 +31739,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31759,13 +31759,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -31778,11 +31778,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -31797,12 +31797,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31813,10 +31813,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31827,10 +31827,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31841,13 +31841,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31857,10 +31857,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -31871,10 +31871,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -31887,12 +31887,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -31904,11 +31904,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -31920,11 +31920,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -31935,11 +31935,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31953,13 +31953,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -31970,11 +31970,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -31988,13 +31988,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32005,11 +32005,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -32023,13 +32023,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32041,12 +32041,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32059,12 +32059,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32078,13 +32078,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32096,12 +32096,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32114,12 +32114,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32133,13 +32133,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -32151,11 +32151,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -32170,13 +32170,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32189,12 +32189,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32207,12 +32207,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -32227,13 +32227,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -32246,11 +32246,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32265,12 +32265,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -32281,10 +32281,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -32295,10 +32295,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -32309,13 +32309,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32325,10 +32325,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32339,10 +32339,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -32355,12 +32355,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -32372,11 +32372,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -32388,11 +32388,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32403,11 +32403,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32421,13 +32421,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -32438,11 +32438,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32456,13 +32456,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32473,11 +32473,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32491,13 +32491,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32509,12 +32509,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32527,12 +32527,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32546,13 +32546,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32564,12 +32564,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32582,12 +32582,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32601,13 +32601,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -32619,11 +32619,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32638,13 +32638,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32657,12 +32657,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32675,12 +32675,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32695,13 +32695,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -32714,11 +32714,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32733,12 +32733,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32749,10 +32749,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32763,10 +32763,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32777,13 +32777,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32793,10 +32793,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -32807,10 +32807,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -32823,12 +32823,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -32840,11 +32840,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -32856,11 +32856,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -32871,11 +32871,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32889,13 +32889,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -32906,11 +32906,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32924,13 +32924,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -32941,11 +32941,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -32959,13 +32959,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32977,12 +32977,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -32995,12 +32995,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33014,13 +33014,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33032,12 +33032,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33050,12 +33050,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33069,13 +33069,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -33087,11 +33087,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33106,13 +33106,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33125,12 +33125,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33143,12 +33143,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33163,13 +33163,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -33182,11 +33182,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33201,12 +33201,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33217,10 +33217,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33231,10 +33231,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33245,13 +33245,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33261,10 +33261,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33275,10 +33275,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -33291,12 +33291,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -33308,11 +33308,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -33324,11 +33324,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -33339,11 +33339,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33357,13 +33357,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -33374,11 +33374,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33392,13 +33392,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -33409,11 +33409,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33427,13 +33427,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33445,12 +33445,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33463,12 +33463,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33482,13 +33482,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33500,12 +33500,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33518,12 +33518,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33537,13 +33537,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -33555,11 +33555,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33574,13 +33574,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33593,12 +33593,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33611,12 +33611,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33631,13 +33631,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -33650,11 +33650,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33669,12 +33669,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33685,10 +33685,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33699,10 +33699,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33713,13 +33713,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33729,10 +33729,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -33743,10 +33743,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -33759,12 +33759,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -33776,11 +33776,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -33792,11 +33792,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -33807,11 +33807,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33825,13 +33825,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -33842,11 +33842,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33860,13 +33860,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -33877,11 +33877,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -33895,13 +33895,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33913,12 +33913,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33931,12 +33931,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33950,13 +33950,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33968,12 +33968,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -33986,12 +33986,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -34005,13 +34005,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -34023,11 +34023,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -34042,13 +34042,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -34061,12 +34061,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -34079,12 +34079,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -34099,13 +34099,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -34118,11 +34118,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -34137,12 +34137,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -34153,10 +34153,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -34167,10 +34167,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -34181,7 +34181,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64
 
@@ -36527,7 +36527,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -36537,10 +36537,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -36551,10 +36551,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -36567,12 +36567,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -36584,11 +36584,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -36600,11 +36600,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -36615,11 +36615,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -36633,13 +36633,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -36650,11 +36650,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -36668,13 +36668,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -36685,11 +36685,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -36703,13 +36703,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36721,12 +36721,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36739,12 +36739,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36758,13 +36758,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36776,12 +36776,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36794,12 +36794,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36813,13 +36813,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -36831,11 +36831,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -36850,13 +36850,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36869,12 +36869,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36887,12 +36887,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -36907,13 +36907,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -36926,11 +36926,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -36945,12 +36945,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -36961,10 +36961,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -36975,10 +36975,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -36989,13 +36989,13 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37005,10 +37005,10 @@
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37019,10 +37019,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -37035,12 +37035,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -37052,11 +37052,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -37068,11 +37068,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -37083,11 +37083,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -37101,13 +37101,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -37118,11 +37118,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -37136,13 +37136,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -37153,11 +37153,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -37171,13 +37171,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37189,12 +37189,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37207,12 +37207,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37226,13 +37226,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37244,12 +37244,12 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37262,12 +37262,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37281,13 +37281,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -37299,11 +37299,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -37318,13 +37318,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37337,12 +37337,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37355,12 +37355,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -37375,13 +37375,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -37394,11 +37394,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -37413,12 +37413,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37429,10 +37429,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37443,10 +37443,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -37457,7 +37457,7 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
diff --git a/test/qs8-igemm-minmax-rndnu.yaml b/test/qs8-igemm-minmax-rndnu.yaml
index a4fe4d07..3e6d3d9 100644
--- a/test/qs8-igemm-minmax-rndnu.yaml
+++ b/test/qs8-igemm-minmax-rndnu.yaml
@@ -3,223 +3,223 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -237,10 +237,10 @@
 - name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal_padal
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot